From 81ea42a2a46fea1f633bc34352fde02f4c412f7c Mon Sep 17 00:00:00 2001 From: Tony Tam <149837+tonytamsf@users.noreply.github.com> Date: Sun, 11 Jul 2021 00:58:54 -0700 Subject: Dedup based on item unique id, media url or title (#4839) --- .../de/danoeh/antennapod/core/storage/DBTasks.java | 35 +++++++-- .../core/syndication/handler/FeedHandler.java | 88 +++++++++++++++++----- 2 files changed, 101 insertions(+), 22 deletions(-) (limited to 'core/src/main/java/de') diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java index 4ccd34e28..ee3cf31a1 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java +++ b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java @@ -334,11 +334,36 @@ public final class DBTasks { } /** - * Get a FeedItem by its identifying value. + * Get a FeedItem by its identifying value or download_url. + * For de-duplicating items that are not stored yet, see also FeedHandler.dedupItems */ - private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, String identifier) { + private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, FeedItem searchItem) { for (FeedItem item : feed.getItems()) { - if (TextUtils.equals(item.getIdentifyingValue(), identifier)) { + if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) { + return item; + } + } + // Did not find item with same ID. Try to guess duplicates based on other metadata. 
+ for (FeedItem item : feed.getItems()) { + if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) { + continue; + } + + boolean isDuplicate = false; + if (TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) { + Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl()); + isDuplicate = true; + } else if (TextUtils.equals(item.getTitle(), searchItem.getTitle()) + && item.getPubDate().equals(searchItem.getPubDate())) { + Log.d(TAG, "Removing duplicate episode title + pubDate " + item.getTitle() + " " + item.getPubDate()); + isDuplicate = true; + } + if (isDuplicate) { + DBWriter.addDownloadStatus(new DownloadStatus(feed, + searchItem.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION, false, + "The podcast host changed the ID of an existing episode instead of just " + + "updating the episode itself. AntennaPod attempted to repair it.", false)); + item.setItemIdentifier(searchItem.getItemIdentifier()); return item; } } @@ -411,7 +436,7 @@ public final class DBTasks { // Look for new or updated Items for (int idx = 0; idx < newFeed.getItems().size(); idx++) { final FeedItem item = newFeed.getItems().get(idx); - FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item.getIdentifyingValue()); + FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item); if (oldItem == null) { // item is new item.setFeed(savedFeed); @@ -445,7 +470,7 @@ public final class DBTasks { Iterator it = savedFeed.getItems().iterator(); while (it.hasNext()) { FeedItem feedItem = it.next(); - if (searchFeedItemByIdentifyingValue(newFeed, feedItem.getIdentifyingValue()) == null) { + if (searchFeedItemByIdentifyingValue(newFeed, feedItem) == null) { unlistedItems.add(feedItem); it.remove(); } diff --git a/core/src/main/java/de/danoeh/antennapod/core/syndication/handler/FeedHandler.java b/core/src/main/java/de/danoeh/antennapod/core/syndication/handler/FeedHandler.java index 
package de.danoeh.antennapod.core.syndication.handler;

import android.text.TextUtils;
import android.util.Log;

import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import de.danoeh.antennapod.model.feed.Feed;
import de.danoeh.antennapod.model.feed.FeedItem;

/**
 * Parses a downloaded feed file with a SAX parser and removes duplicate
 * episodes from the parsed item list.
 */
public class FeedHandler {
    private static final String TAG = "FeedHandler";

    /**
     * Parses the file referenced by {@code feed.getFile_url()} and de-duplicates
     * the feed's items afterwards.
     *
     * @param feed the feed whose local file should be parsed; its item list is
     *             replaced by the de-duplicated list
     * @return the result built from the handler's parsed feed and alternate URLs
     * @throws SAXException                 if the parser reports an error
     * @throws IOException                  if the feed file cannot be read
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws UnsupportedFeedtypeException declared for feed type detection
     */
    public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException,
            ParserConfigurationException, UnsupportedFeedtypeException {
        TypeGetter tg = new TypeGetter();
        TypeGetter.Type type = tg.getType(feed);
        SyndHandler handler = new SyndHandler(feed, type);

        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        SAXParser saxParser = factory.newSAXParser();
        File file = new File(feed.getFile_url());
        // try-with-resources: the reader must be closed even when parsing throws.
        try (Reader inputStreamReader = new XmlStreamReader(file)) {
            InputSource inputSource = new InputSource(inputStreamReader);
            saxParser.parse(inputSource, handler);
        }
        feed.setItems(dedupItems(feed.getItems()));
        // NOTE(review): handler.state.feed is presumably the same object as feed,
        // so the de-duplicated items are part of the result - confirm in SyndHandler.
        return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls);
    }

    /**
     * Returns a copy of {@code items} without duplicate episodes. The first
     * occurrence of an episode is kept; a later item is dropped when it repeats
     * <ol>
     *   <li>a non-empty item identifier (guid),</li>
     *   <li>a non-empty media stream url, or</li>
     *   <li>the combination of non-empty title and publication date
     *       (only checked for items that have a media stream url).</li>
     * </ol>
     * Items that lack a criterion are never treated as duplicates by it.
     *
     * <p>For updating items that are stored in the database, see also:
     * DBTasks.searchFeedItemByIdentifyingValue
     *
     * @param items the parsed items, may be {@code null}; the list itself is not modified
     * @return a new list without duplicates, or {@code null} if {@code items} is {@code null}
     */
    public static List<FeedItem> dedupItems(List<FeedItem> items) {
        if (items == null) {
            return null;
        }
        List<FeedItem> list = new ArrayList<>(items);
        // Separate sets per criterion so that e.g. a guid that happens to equal
        // another episode's stream url is not mistaken for a duplicate.
        Set<String> seenIdentifiers = new HashSet<>();
        Set<String> seenStreamUrls = new HashSet<>();
        Set<String> seenTitlesAndDates = new HashSet<>();
        Iterator<FeedItem> it = list.iterator();
        while (it.hasNext()) {
            FeedItem item = it.next();

            // Register the identifier up front so it is recorded for every item,
            // including those without media. Empty identifiers are skipped:
            // items without a guid must not be considered duplicates of each other.
            String identifier = item.getItemIdentifier();
            if (!TextUtils.isEmpty(identifier) && !seenIdentifiers.add(identifier)) {
                Log.d(TAG, "Removing duplicate episode guid " + identifier);
                it.remove();
                continue;
            }

            if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) {
                continue;
            }
            String streamUrl = item.getMedia().getStreamUrl();
            if (!seenStreamUrls.add(streamUrl)) {
                Log.d(TAG, "Removing duplicate episode stream url " + streamUrl);
                it.remove();
                continue;
            }

            String titleAndDate = titleAndDateKey(item);
            if (titleAndDate != null && !seenTitlesAndDates.add(titleAndDate)) {
                Log.d(TAG, "Removing duplicate episode title and pubDate "
                        + item.getTitle()
                        + " " + item.getPubDate());
                it.remove();
            }
        }
        return list;
    }

    /**
     * Builds the key used to detect episodes with the same title and publication date.
     *
     * @return the key, or {@code null} if the item has no title or no publication date
     */
    private static String titleAndDateKey(FeedItem item) {
        if (TextUtils.isEmpty(item.getTitle()) || item.getPubDate() == null) {
            return null;
        }
        String pubDate = item.getPubDate().toString();
        if (TextUtils.isEmpty(pubDate)) {
            return null;
        }
        return item.getTitle() + pubDate;
    }
}