diff options
author | ByteHamster <ByteHamster@users.noreply.github.com> | 2021-09-06 17:59:17 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-09-06 17:59:17 +0200 |
commit | b36cdb0c4ecb987b1a8e0168c15552c7c11d03a1 (patch) | |
tree | 02c4645f0ef82fb76d7c2fe575df8aa3ac33d130 /core/src/main/java/de | |
parent | b9f578ed5c83dff7ebf70e2fb5d6ded9c9d4482f (diff) | |
download | AntennaPod-b36cdb0c4ecb987b1a8e0168c15552c7c11d03a1.zip |
Improvements related to duplicate detection (#5387)
* Move duplicate detection to one single place
* Canonicalize some common characters that are often confused
* Assume same episode even when date is off by up to 1 week
* Display duplicate detection as warning, not error
Diffstat (limited to 'core/src/main/java/de')
-rw-r--r-- | core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java | 110 | ||||
-rw-r--r-- | core/src/main/java/de/danoeh/antennapod/core/util/DownloadError.java | 3 |
2 files changed, 73 insertions, 40 deletions
diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java index 52966c3f6..9dd979dc7 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java +++ b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java @@ -335,59 +335,56 @@ public final class DBTasks { } /** - * Get a FeedItem by its identifying value or download_url. - * For de-duplicating items that are not stored yet, see also FeedHandler.dedupItems + * Get a FeedItem by its identifying value. */ - private static FeedItem searchFeedItemByIdentifyingValue(Context context, Feed feed, FeedItem searchItem) { - for (FeedItem item : feed.getItems()) { + private static FeedItem searchFeedItemByIdentifyingValue(List<FeedItem> items, FeedItem searchItem) { + for (FeedItem item : items) { if (TextUtils.equals(item.getIdentifyingValue(), searchItem.getIdentifyingValue())) { return item; } } - // Did not find item with same ID. Try to guess duplicates based on other metadata. - for (FeedItem item : feed.getItems()) { - boolean isDuplicate = false; + return null; + } + + /** + * Guess if one of the items could actually mean the searched item, even if it uses another identifying value. + * This is to work around podcasters breaking their GUIDs. + */ + private static FeedItem searchFeedItemGuessDuplicate(List<FeedItem> items, FeedItem searchItem) { + for (FeedItem item : items) { if ((item.getMedia() != null) && (searchItem.getMedia() != null) && !TextUtils.isEmpty(item.getMedia().getStreamUrl()) && !TextUtils.isEmpty(searchItem.getMedia().getStreamUrl()) && TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) { - Log.d(TAG, "Removing duplicate episode stream url " + item.getMedia().getStreamUrl()); - isDuplicate = true; - } else if (TextUtils.equals(item.getTitle(), searchItem.getTitle())) { - Log.d(TAG, "Found same title. 
Checking pubdate: " + item.getTitle()); + return item; + } else if (titlesLookSimilar(item.getTitle(), searchItem.getTitle())) { long dateOriginal = item.getPubDate().getTime(); long dateNew = searchItem.getPubDate() == null ? 0 : searchItem.getPubDate().getTime(); - if (Math.abs(dateOriginal - dateNew) < 24L * 3600L * 1000L) { // Same day - Log.d(TAG, "Same pubDate. Removing. " + item.getPubDate() + ", " + searchItem.getPubDate()); - isDuplicate = true; + if (Math.abs(dateOriginal - dateNew) < 7L * 24L * 3600L * 1000L) { // Same week + return item; } } - if (isDuplicate) { - DBWriter.addDownloadStatus(new DownloadStatus(feed, - searchItem.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION, false, - "The podcast host changed the ID of an existing episode instead of just " - + "updating the episode itself. AntennaPod attempted to repair it.\n\n" - + "{" + item.getTitle() + "} with ID " + item.getItemIdentifier() - + " seems to be the same as {" + searchItem.getTitle() + "} with ID " - + searchItem.getItemIdentifier(), false)); - item.setItemIdentifier(searchItem.getItemIdentifier()); - - if (item.isPlayed() && item.getMedia() != null) { - EpisodeAction action = new EpisodeAction.Builder(item, EpisodeAction.PLAY) - .currentTimestamp() - .started(item.getMedia().getDuration() / 1000) - .position(item.getMedia().getDuration() / 1000) - .total(item.getMedia().getDuration() / 1000) - .build(); - SyncService.enqueueEpisodeAction(context, action); - } - return item; - } } return null; } + private static boolean titlesLookSimilar(String title1, String title2) { + if (TextUtils.isEmpty(title1) || TextUtils.isEmpty(title2)) { + return false; + } + return canonicalizeTitle(title1).equals(canonicalizeTitle(title2)); + } + + private static String canonicalizeTitle(String title) { + return title + .trim() + .replace('“', '"') + .replace('”', '"') + .replace('„', '"') + .replace('—', '-'); + } + /** * Adds new Feeds to the database or updates the old versions if they already exists. 
If another Feed with the same * identifying value already exists, this method will add new FeedItems from the new Feed to the existing Feed. @@ -454,8 +451,45 @@ public final class DBTasks { // Look for new or updated Items for (int idx = 0; idx < newFeed.getItems().size(); idx++) { final FeedItem item = newFeed.getItems().get(idx); - FeedItem oldItem = searchFeedItemByIdentifyingValue(context, savedFeed, item); + + if (item != searchFeedItemGuessDuplicate(newFeed.getItems(), item)) { + // Canonical episode is the first one returned (usually oldest) + DBWriter.addDownloadStatus(new DownloadStatus(savedFeed, + item.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, false, + "The podcast host appears to have added the same episode twice. " + + "AntennaPod attempted to repair it.", false)); + continue; + } + + FeedItem oldItem = searchFeedItemByIdentifyingValue(savedFeed.getItems(), item); if (oldItem == null) { + oldItem = searchFeedItemGuessDuplicate(savedFeed.getItems(), item); + if (oldItem != null) { + Log.d(TAG, "Repaired duplicate: " + oldItem + ", " + item); + DBWriter.addDownloadStatus(new DownloadStatus(savedFeed, + item.getTitle(), DownloadError.ERROR_PARSER_EXCEPTION_DUPLICATE, false, + "The podcast host changed the ID of an existing episode instead of just " + + "updating the episode itself. 
AntennaPod attempted to repair it.\n\n" + + "{" + oldItem.getTitle() + "} with ID " + oldItem.getItemIdentifier() + + " seems to be the same as {" + item.getTitle() + "} with ID " + + item.getItemIdentifier(), false)); + oldItem.setItemIdentifier(item.getItemIdentifier()); + + if (oldItem.isPlayed() && oldItem.getMedia() != null) { + EpisodeAction action = new EpisodeAction.Builder(oldItem, EpisodeAction.PLAY) + .currentTimestamp() + .started(oldItem.getMedia().getDuration() / 1000) + .position(oldItem.getMedia().getDuration() / 1000) + .total(oldItem.getMedia().getDuration() / 1000) + .build(); + SyncService.enqueueEpisodeAction(context, action); + } + } + } + + if (oldItem != null) { + oldItem.updateFromOther(item); + } else { // item is new item.setFeed(savedFeed); @@ -477,8 +511,6 @@ public final class DBTasks { + " new, prior most recent date = " + priorMostRecentDate); item.setNew(); } - } else { - oldItem.updateFromOther(item); } } @@ -487,7 +519,7 @@ public final class DBTasks { Iterator<FeedItem> it = savedFeed.getItems().iterator(); while (it.hasNext()) { FeedItem feedItem = it.next(); - if (searchFeedItemByIdentifyingValue(context, newFeed, feedItem) == null) { + if (searchFeedItemByIdentifyingValue(newFeed.getItems(), feedItem) == null) { unlistedItems.add(feedItem); it.remove(); } diff --git a/core/src/main/java/de/danoeh/antennapod/core/util/DownloadError.java b/core/src/main/java/de/danoeh/antennapod/core/util/DownloadError.java index 9c4a61cd8..9e5282576 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/util/DownloadError.java +++ b/core/src/main/java/de/danoeh/antennapod/core/util/DownloadError.java @@ -27,7 +27,8 @@ public enum DownloadError { ERROR_IO_BLOCKED(18, R.string.download_error_blocked), ERROR_UNSUPPORTED_TYPE_HTML(19, R.string.download_error_unsupported_type_html), ERROR_NOT_FOUND(20, R.string.download_error_not_found), - ERROR_CERTIFICATE(21, R.string.download_error_certificate); + ERROR_CERTIFICATE(21, 
R.string.download_error_certificate), + ERROR_PARSER_EXCEPTION_DUPLICATE(22, R.string.download_error_parser_exception); private final int code; private final int resId; |