summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorByteHamster <ByteHamster@users.noreply.github.com>2024-03-24 18:04:39 +0100
committerGitHub <noreply@github.com>2024-03-24 18:04:39 +0100
commit7b390f1c9299d644ad0339e83724e1457c150410 (patch)
tree8e74d59cc8e3447b81817759869b7d5af1345b12
parent084b9c231744a5657a620087b4b560db38b66ab8 (diff)
downloadAntennaPod-7b390f1c9299d644ad0339e83724e1457c150410.zip
Speed up feed parsing (#7023)
AntennaPod is quite slow with huge feeds. The reason is that we have a bunch of workarounds for misbehaving feeds that also make it slower to work with feeds that do not misbehave.

Changes:
- Only start guessing duplicate episodes when no "proper" match is found
- Only parse non-html as HTML for attributes that really need it
- Do not log failed Long parsing when size is not specified
- Try to parse dates with RFC822 first before falling back to workarounds for other formats

I ran a benchmark with "Stuff you should know" (for which the workarounds are not needed) containing 2k episodes. Includes download of 8MB of feed XML (~5 seconds), debug build.

Before: 44 seconds, after: 13 seconds ==> 3.4 times faster feed refresh
-rw-r--r--core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java7
-rw-r--r--core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java2
-rw-r--r--parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java2
-rw-r--r--parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java10
-rw-r--r--parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java5
-rw-r--r--parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java25
6 files changed, 36 insertions, 15 deletions
diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java
index 1fb0991dd..883aefee8 100644
--- a/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java
+++ b/core/src/main/java/de/danoeh/antennapod/core/storage/DBTasks.java
@@ -171,6 +171,13 @@ public final class DBTasks {
* This is to work around podcasters breaking their GUIDs.
*/
private static FeedItem searchFeedItemGuessDuplicate(List<FeedItem> items, FeedItem searchItem) {
+ // First, see if it is a well-behaving feed that contains an item with the same identifier
+ for (FeedItem item : items) {
+ if (FeedItemDuplicateGuesser.sameAndNotEmpty(item.getItemIdentifier(), searchItem.getItemIdentifier())) {
+ return item;
+ }
+ }
+ // Not found yet, start more expensive guessing
for (FeedItem item : items) {
if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) {
return item;
diff --git a/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java b/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java
index 88f896187..1eb8b0577 100644
--- a/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java
+++ b/core/src/main/java/de/danoeh/antennapod/core/storage/FeedItemDuplicateGuesser.java
@@ -31,7 +31,7 @@ public class FeedItemDuplicateGuesser {
&& mimeTypeLooksSimilar(media1, media2);
}
- private static boolean sameAndNotEmpty(String string1, String string2) {
+ public static boolean sameAndNotEmpty(String string1, String string2) {
if (TextUtils.isEmpty(string1) || TextUtils.isEmpty(string2)) {
return false;
}
diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java
index cb4bcb8a7..e76b678f1 100644
--- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java
+++ b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Itunes.java
@@ -51,12 +51,12 @@ public class Itunes extends Namespace {
}
String content = state.getContentBuf().toString();
- String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString();
if (TextUtils.isEmpty(content)) {
return;
}
if (AUTHOR.equals(localName) && state.getFeed() != null && state.getTagstack().size() <= 3) {
+ String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString();
state.getFeed().setAuthor(contentFromHtml);
} else if (DURATION.equals(localName)) {
try {
diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java
index 85cafea84..9336423c6 100644
--- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java
+++ b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Media.java
@@ -73,10 +73,12 @@ public class Media extends Namespace {
&& url != null && validTypeMedia) {
long size = 0;
String sizeStr = attributes.getValue(SIZE);
- try {
- size = Long.parseLong(sizeStr);
- } catch (NumberFormatException e) {
- Log.e(TAG, "Size \"" + sizeStr + "\" could not be parsed.");
+ if (!TextUtils.isEmpty(sizeStr)) {
+ try {
+ size = Long.parseLong(sizeStr);
+ } catch (NumberFormatException e) {
+ Log.e(TAG, "Size \"" + sizeStr + "\" could not be parsed.");
+ }
}
int durationMs = 0;
diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java
index b19500895..fd5e91e9d 100644
--- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java
+++ b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/namespace/Rss20.java
@@ -92,7 +92,6 @@ public class Rss20 extends Namespace {
} else if (state.getTagstack().size() >= 2 && state.getContentBuf() != null) {
String contentRaw = state.getContentBuf().toString();
String content = SyndStringUtils.trimAllWhitespace(contentRaw);
- String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString();
SyndElement topElement = state.getTagstack().peek();
String top = topElement.getName();
SyndElement secondElement = state.getSecondTag();
@@ -108,6 +107,8 @@ public class Rss20 extends Namespace {
state.getCurrentItem().setItemIdentifier(contentRaw);
}
} else if (TITLE.equals(top)) {
+ // Calling fromHtml only if needed because it is slow for huge feeds
+ String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString();
if (ITEM.equals(second) && state.getCurrentItem() != null) {
state.getCurrentItem().setTitle(contentFromHtml);
} else if (CHANNEL.equals(second) && state.getFeed() != null) {
@@ -128,6 +129,8 @@ public class Rss20 extends Namespace {
}
} else if (DESCR.equals(localName)) {
if (CHANNEL.equals(second) && state.getFeed() != null) {
+ // Calling fromHtml only if needed because it is slow for huge feeds
+ String contentFromHtml = HtmlCompat.fromHtml(content, HtmlCompat.FROM_HTML_MODE_COMPACT).toString();
state.getFeed().setDescription(contentFromHtml);
} else if (ITEM.equals(second) && state.getCurrentItem() != null) {
state.getCurrentItem().setDescriptionIfLonger(content); // fromHtml here breaks \n when not html
diff --git a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java
index 9b7f48769..4d9f68905 100644
--- a/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java
+++ b/parser/feed/src/main/java/de/danoeh/antennapod/parser/feed/util/DateUtils.java
@@ -5,6 +5,7 @@ import android.util.Log;
import androidx.annotation.Nullable;
import org.apache.commons.lang3.StringUtils;
+import java.text.ParseException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Date;
@@ -14,19 +15,27 @@ import java.util.TimeZone;
/**
* Parses several date formats.
*/
-public class DateUtils {
-
- private DateUtils() {
-
- }
-
+public abstract class DateUtils {
private static final String TAG = "DateUtils";
- private static final TimeZone defaultTimezone = TimeZone.getTimeZone("GMT");
+ private static final TimeZone TIME_ZONE_GMT = TimeZone.getTimeZone("GMT");
+ private static final ThreadLocal<SimpleDateFormat> RFC822_DATE_FORMAT = new ThreadLocal<>() {
+ @Override
+ protected SimpleDateFormat initialValue() {
+ SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.US);
+ dateFormat.setTimeZone(TIME_ZONE_GMT);
+ return dateFormat;
+ }
+ };
public static Date parse(final String input) {
if (input == null) {
throw new IllegalArgumentException("Date must not be null");
}
+ try {
+ return RFC822_DATE_FORMAT.get().parse(input);
+ } catch (ParseException ignored) {
+ // Feed not following the specification? Now start all our expensive workarounds.
+ }
String date = input.trim().replace('/', '-').replaceAll("( ){2,}+", " ");
// remove colon from timezone to avoid differences between Android and Java SimpleDateFormat
@@ -97,7 +106,7 @@ public class DateUtils {
SimpleDateFormat parser = new SimpleDateFormat("", Locale.US);
parser.setLenient(false);
- parser.setTimeZone(defaultTimezone);
+ parser.setTimeZone(TIME_ZONE_GMT);
ParsePosition pos = new ParsePosition(0);
for (String pattern : patterns) {