diff options
author | ByteHamster <ByteHamster@users.noreply.github.com> | 2024-03-24 18:04:39 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-24 18:04:39 +0100 |
commit | 7b390f1c9299d644ad0339e83724e1457c150410 (patch) | |
tree | 8e74d59cc8e3447b81817759869b7d5af1345b12 | |
parent | 084b9c231744a5657a620087b4b560db38b66ab8 (diff) | |
download | AntennaPod-7b390f1c9299d644ad0339e83724e1457c150410.zip |
Speed up feed parsing (#7023)
AntennaPod is quite slow with huge feeds. The reason is that we have a bunch of workarounds for misbehaving feeds that also make it slower to work with feeds that do not misbehave.
Changes:
- Only start guessing duplicate episodes when no "proper" match is found
- Only run the (slow) HTML parser on the fields that really need it, instead of on every element's content
- Do not log failed Long parsing when size is not specified
- Try to parse dates with RFC822 first before falling back to workarounds for other formats
I ran a benchmark with "Stuff you should know" (for which the workarounds are not needed), a feed containing 2k episodes. The timings were measured on a debug build and include the download of 8 MB of feed XML (~5 seconds).
Before: 44 seconds, after: 13 seconds ==> 3.4 times faster feed refresh
6 files changed, 36 insertions, 15 deletions
diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java index 1fb0991dd..883aefee8 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java +++ b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java @@ -171,6 +171,13 @@ public final class DBTasks { * This is to work around podcasters breaking their GUIDs. */ private static FeedItem searchFeedItemGuessDuplicate(List<FeedItem> items, FeedItem searchItem) { + // First, see if it is a well-behaving feed that contains an item with the same identifier + for (FeedItem item : items) { + if (FeedItemDuplicateGuesser.sameAndNotEmpty(item.getItemIdentifier(), searchItem.getItemIdentifier())) { + return item; + } + } + // Not found yet, start more expensive guessing for (FeedItem item : items) { if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) { return item; diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java b/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java index 88f896187..1eb8b0577 100644 --- a/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java +++ b/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java @@ -31,7 +31,7 @@ public class FeedItemDuplicateGuesser { && mimeTypeLooksSimilar(media1, media2); } - private static boolean sameAndNotEmpty(String string1, String string2) { + public static boolean sameAndNotEmpty(String string1, String string2) { if (TextUtils.isEmpty(string1) || TextUtils.isEmpty(string2)) { return false; } diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java index cb4bcb8a7..e76b678f1 100644 --- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java +++ 
b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java @@ -51,12 +51,12 @@ public class Itunes extends Namespace { } String content = state.getContentBuf().toString(); - String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString(); if (TextUtils.isEmpty(content)) { return; } if (AUTHOR.equals(localName) && state.getFeed() != null && state.getTagstack().size() <= 3) { + String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString(); state.getFeed().setAuthor(contentFromHtml); } else if (DURATION.equals(localName)) { try { diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java index 85cafea84..9336423c6 100644 --- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java +++ b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java @@ -73,10 +73,12 @@ public class Media extends Namespace { && url != null && validTypeMedia) { long size = 0; String sizeStr = attributes.getValue(SIZE); - try { - size = Long.parseLong(sizeStr); - } catch (NumberFormatException e) { - Log.e(TAG, "Size \"" + sizeStr + "\" could not be parsed."); + if (!TextUtils.isEmpty(sizeStr)) { + try { + size = Long.parseLong(sizeStr); + } catch (NumberFormatException e) { + Log.e(TAG, "Size \"" + sizeStr + "\" could not be parsed."); + } } int durationMs = 0; diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java index b19500895..fd5e91e9d 100644 --- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java +++ b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java @@ -92,7 +92,6 @@ public class Rss20 extends Namespace { } else if (state.getTagstack().size() >= 2 && 
state.getContentBuf() != null) { String contentRaw = state.getContentBuf().toString(); String content = SyndStringUtils.trimAllWhitespace(contentRaw); - String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString(); SyndElement topElement = state.getTagstack().peek(); String top = topElement.getName(); SyndElement secondElement = state.getSecondTag(); @@ -108,6 +107,8 @@ public class Rss20 extends Namespace { state.getCurrentItem().setItemIdentifier(contentRaw); } } else if (TITLE.equals(top)) { + // Calling fromHtml only if needed because it is slow for huge feeds + String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString(); if (ITEM.equals(second) && state.getCurrentItem() != null) { state.getCurrentItem().setTitle(contentFromHtml); } else if (CHANNEL.equals(second) && state.getFeed() != null) { @@ -128,6 +129,8 @@ public class Rss20 extends Namespace { } } else if (DESCR.equals(localName)) { if (CHANNEL.equals(second) && state.getFeed() != null) { + // Calling fromHtml only if needed because it is slow for huge feeds + String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString(); state.getFeed().setDescription(contentFromHtml); } else if (ITEM.equals(second) && state.getCurrentItem() != null) { state.getCurrentItem().setDescriptionIfLonger(content); // fromHtml here breaks \n when not html diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java index 9b7f48769..4d9f68905 100644 --- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java +++ b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java @@ -5,6 +5,7 @@ import android.util.Log; import androidx.annotation.Nullable; import org.apache.commons.lang3.StringUtils; +import java.text.ParseException; import java.text.ParsePosition; 
import java.text.SimpleDateFormat; import java.util.Date; @@ -14,19 +15,27 @@ import java.util.TimeZone; /** * Parses several date formats. */ -public class DateUtils { - - private DateUtils() { - - } - +public abstract class DateUtils { private static final String TAG = "DateUtils"; - private static final TimeZone defaultTimezone = TimeZone.getTimeZone("GMT"); + private static final TimeZone TIME_ZONE_GMT = TimeZone.getTimeZone("GMT"); + private static final ThreadLocal<SimpleDateFormat> RFC822_DATE_FORMAT = new ThreadLocal<>() { + @Override + protected SimpleDateFormat initialValue() { + SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.US); + dateFormat.setTimeZone(TIME_ZONE_GMT); + return dateFormat; + } + }; public static Date parse(final String input) { if (input == null) { throw new IllegalArgumentException("Date must not be null"); } + try { + return RFC822_DATE_FORMAT.get().parse(input); + } catch (ParseException ignored) { + // Feed not following the specification? Now start all our expensive workarounds. + } String date = input.trim().replace('/', '-').replaceAll("( ){2,}+", " "); // remove colon from timezone to avoid differences between Android and Java SimpleDateFormat @@ -97,7 +106,7 @@ public class DateUtils { SimpleDateFormat parser = new SimpleDateFormat("", Locale.US); parser.setLenient(false); - parser.setTimeZone(defaultTimezone); + parser.setTimeZone(TIME_ZONE_GMT); ParsePosition pos = new ParsePosition(0); for (String pattern : patterns) { |