From 81ea42a2a46fea1f633bc34352fde02f4c412f7c Mon Sep 17 00:00:00 2001 From: Tony Tam <149837+tonytamsf@users.noreply.github.com> Date: Sun, 11 Jul 2021 00:58:54 -0700 Subject: Dedup based on item unique id, media url or title (#4839) --- .../de/danoeh/antennapod/core/storage/DBTasks.java | 35 +++++++-- .../core/syndication/handler/FeedHandler.java | 88 +++++++++++++++++----- .../antennapod/core/storage/DbCleanupTests.java | 2 +- .../core/storage/DbNullCleanupAlgorithmTest.java | 2 +- .../antennapod/core/storage/DbTasksTest.java | 34 ++++++++- 5 files changed, 136 insertions(+), 25 deletions(-) (limited to 'core/src') diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java index 4ccd34e28..ee3cf31a1 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java +++ b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java @@ -334,11 +334,36 @@ public final class DBTasks { } /** - * Get a FeedItem by its identifying value. + * Get a FeedItem by its identifying value or download_url. + * For de-duplicating items that are not stored yet, see also FeedHandler.dedupItems */ - private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, String identifier) { + private static FeedItem searchFeedItemByIdentifyingValue(Feed feed, FeedItem searchItem) { for (FeedItem item : feed.getItems()) { - if (TextUtils.equals(item.getIdentifyingValue(), identifier)) { + if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) { + return item; + } + } + // Did not find item with same ID. Try to guess duplicates based on other metadata. 
+ for (FeedItem item : feed.getItems()) { + if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) { + continue; + } + + boolean isDuplicate = false; + if (TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) { + Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl()); + isDuplicate = true; + } else if (TextUtils.equals(item.getTitle(), searchItem.getTitle()) + && item.getPubDate().equals(searchItem.getPubDate())) { + Log.d(TAG, "Removing duplicate episode title + pubDate " + item.getTitle() + " " + item.getPubDate()); + isDuplicate = true; + } + if (isDuplicate) { + DBWriter.addDownloadStatus(new DownloadStatus(feed, + searchItem.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION, false, + "The podcast host changed the ID of an existing episode instead of just " + + "updating the episode itself. AntennaPod attempted to repair it.", false)); + item.setItemIdentifier(searchItem.getItemIdentifier()); return item; } } @@ -411,7 +436,7 @@ public final class DBTasks { // Look for new or updated Items for (int idx = 0; idx < newFeed.getItems().size(); idx++) { final FeedItem item = newFeed.getItems().get(idx); - FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item.getIdentifyingValue()); + FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed, item); if (oldItem == null) { // item is new item.setFeed(savedFeed); @@ -445,7 +470,7 @@ public final class DBTasks { Iterator it = savedFeed.getItems().iterator(); while (it.hasNext()) { FeedItem feedItem = it.next(); - if (searchFeedItemByIdentifyingValue(newFeed, feedItem.getIdentifyingValue()) == null) { + if (searchFeedItemByIdentifyingValue(newFeed, feedItem) == null) { unlistedItems.add(feedItem); it.remove(); } diff --git a/core/src/main/java/de/danoeh/antennapod/core/syndication/handler/FeedHandler.java b/core/src/main/java/de/danoeh/antennapod/core/syndication/handler/FeedHandler.java index 
c9e6ce5fa..fb28d58c4 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/syndication/handler/FeedHandler.java +++ b/core/src/main/java/de/danoeh/antennapod/core/syndication/handler/FeedHandler.java @@ -1,5 +1,8 @@ package de.danoeh.antennapod.core.syndication.handler; +import android.text.TextUtils; +import android.util.Log; + import org.apache.commons.io.input.XmlStreamReader; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -7,30 +10,81 @@ import org.xml.sax.SAXException; import java.io.File; import java.io.IOException; import java.io.Reader; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import de.danoeh.antennapod.model.feed.Feed; +import de.danoeh.antennapod.model.feed.FeedItem; public class FeedHandler { + private static final String TAG = "FeedHandler"; + + public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException, + ParserConfigurationException, UnsupportedFeedtypeException { + TypeGetter tg = new TypeGetter(); + TypeGetter.Type type = tg.getType(feed); + SyndHandler handler = new SyndHandler(feed, type); + + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + SAXParser saxParser = factory.newSAXParser(); + File file = new File(feed.getFile_url()); + Reader inputStreamReader = new XmlStreamReader(file); + InputSource inputSource = new InputSource(inputStreamReader); + + saxParser.parse(inputSource, handler); + inputStreamReader.close(); + feed.setItems(dedupItems(feed.getItems())); + return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls); + } + + /** + * For updating items that are stored in the database, see also: DBTasks.searchFeedItemByIdentifyingValue + */ + public static List dedupItems(List items) { + if (items == null) { + 
return null; + } + List list = new ArrayList<>(items); + Set seen = new HashSet<>(); + Iterator it = list.iterator(); + while (it.hasNext()) { + FeedItem item = it.next(); + if (seen.contains(item.getItemIdentifier())) { + Log.d(TAG, "Removing duplicate episode guid " + item.getItemIdentifier()); + it.remove(); + continue; + } - public FeedHandlerResult parseFeed(Feed feed) throws SAXException, IOException, - ParserConfigurationException, UnsupportedFeedtypeException { - TypeGetter tg = new TypeGetter(); - TypeGetter.Type type = tg.getType(feed); - SyndHandler handler = new SyndHandler(feed, type); - - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setNamespaceAware(true); - SAXParser saxParser = factory.newSAXParser(); - File file = new File(feed.getFile_url()); - Reader inputStreamReader = new XmlStreamReader(file); - InputSource inputSource = new InputSource(inputStreamReader); - - saxParser.parse(inputSource, handler); - inputStreamReader.close(); - return new FeedHandlerResult(handler.state.feed, handler.state.alternateUrls); - } + if (item.getMedia() == null || TextUtils.isEmpty(item.getMedia().getStreamUrl())) { + continue; + } + if (seen.contains(item.getMedia().getStreamUrl())) { + Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl()); + it.remove(); + } else { + seen.add(item.getMedia().getStreamUrl()); + if (TextUtils.isEmpty(item.getTitle()) || TextUtils.isEmpty(item.getPubDate().toString())) { + continue; + } + if (!seen.contains(item.getTitle() + item.getPubDate().toString())) { + seen.add(item.getTitle() + item.getPubDate().toString()); + } else { + Log.d(TAG, "Removing duplicate episode title and pubDate " + + item.getTitle() + + " " + item.getPubDate()); + it.remove(); + } + } + seen.add(item.getItemIdentifier()); + } + return list; + } } diff --git a/core/src/test/java/de/danoeh/antennapod/core/storage/DbCleanupTests.java 
b/core/src/test/java/de/danoeh/antennapod/core/storage/DbCleanupTests.java index 1c4c92574..b5c2e5c73 100644 --- a/core/src/test/java/de/danoeh/antennapod/core/storage/DbCleanupTests.java +++ b/core/src/test/java/de/danoeh/antennapod/core/storage/DbCleanupTests.java @@ -134,7 +134,7 @@ public class DbCleanupTests { if (itemState == FeedItem.PLAYED) { playbackCompletionDate = itemDate; } - FeedItem item = new FeedItem(0, "title", "id", "link", itemDate, itemState, feed); + FeedItem item = new FeedItem(0, "title", "id" + i, "link", itemDate, itemState, feed); File f = new File(destFolder, "file " + i); assertTrue(f.createNewFile()); diff --git a/core/src/test/java/de/danoeh/antennapod/core/storage/DbNullCleanupAlgorithmTest.java b/core/src/test/java/de/danoeh/antennapod/core/storage/DbNullCleanupAlgorithmTest.java index 733318724..6485c9515 100644 --- a/core/src/test/java/de/danoeh/antennapod/core/storage/DbNullCleanupAlgorithmTest.java +++ b/core/src/test/java/de/danoeh/antennapod/core/storage/DbNullCleanupAlgorithmTest.java @@ -92,7 +92,7 @@ public class DbNullCleanupAlgorithmTest { feed.setItems(items); List files = new ArrayList<>(); for (int i = 0; i < numItems; i++) { - FeedItem item = new FeedItem(0, "title", "id", "link", new Date(), FeedItem.PLAYED, feed); + FeedItem item = new FeedItem(0, "title", "id" + i, "link", new Date(), FeedItem.PLAYED, feed); File f = new File(destFolder, "file " + i); assertTrue(f.createNewFile()); diff --git a/core/src/test/java/de/danoeh/antennapod/core/storage/DbTasksTest.java b/core/src/test/java/de/danoeh/antennapod/core/storage/DbTasksTest.java index 9e3dca0f7..0d98ba294 100644 --- a/core/src/test/java/de/danoeh/antennapod/core/storage/DbTasksTest.java +++ b/core/src/test/java/de/danoeh/antennapod/core/storage/DbTasksTest.java @@ -197,6 +197,38 @@ public class DbTasksTest { assertEquals(8, feedFromDB.getItems().size()); // 10 - 2 = 8 items } + @Test + public void testUpdateFeedSetDuplicate() { + final Feed feed = new 
Feed("url", null, "title"); + feed.setItems(new ArrayList<>()); + for (int i = 0; i < 10; i++) { + FeedItem item = + new FeedItem(0, "item " + i, "id " + i, "link " + i, new Date(i), FeedItem.PLAYED, feed); + FeedMedia media = new FeedMedia(item, "download url " + i, 123, "media/mp3"); + item.setMedia(media); + feed.getItems().add(item); + } + PodDBAdapter adapter = PodDBAdapter.getInstance(); + adapter.open(); + adapter.setCompleteFeed(feed); + adapter.close(); + + // change the guid of the first item, but leave the download url the same + FeedItem item = feed.getItemAtIndex(0); + item.setItemIdentifier("id 0-duplicate"); + item.setTitle("item 0 duplicate"); + Feed newFeed = DBTasks.updateFeed(context, feed, false); + assertEquals(10, newFeed.getItems().size()); // "id 0-duplicate" replaces "id 0" because the stream url is the same + + Feed feedFromDB = DBReader.getFeed(newFeed.getId()); + assertEquals(10, feedFromDB.getItems().size()); // "id 0-duplicate" should override "id 0" + + FeedItem updatedItem = feedFromDB.getItemAtIndex(9); + assertEquals("item 0 duplicate", updatedItem.getTitle()); + assertEquals("id 0-duplicate", updatedItem.getItemIdentifier()); // Should use the new ID for sync etc + } + + @SuppressWarnings("SameParameterValue") private void updatedFeedTest(final Feed newFeed, long feedID, List itemIDs, int numItemsOld, int numItemsNew) { @@ -285,7 +317,7 @@ public class DbTasksTest { if (numFeedItems > 0) { List items = new ArrayList<>(numFeedItems); for (int i = 1; i <= numFeedItems; i++) { - FeedItem item = new FeedItem(0, "item " + i + " of " + title, "id", "link", + FeedItem item = new FeedItem(0, "item " + i + " of " + title, "id" + title + i, "link", new Date(), FeedItem.UNPLAYED, feed); items.add(item); } -- cgit v1.2.3