Fixes #3444 - Strip off HTML from podcast descriptions

author: fossterer <shashank@linux.com> 2019-10-24 23:20:31 -0400
committer: fossterer <shashank@linux.com> 2019-10-24 23:20:31 -0400
commit: ca83c5953727d4096ddb7af353575b23324dce80 (patch)
tree: 562983ca2746e3f94089ffcb68e66c8d30ec9ae6 /core/src
parent: 437f3f29c06e4a253e841fc959c768aec8d565a0 (diff)
download: AntennaPod-ca83c5953727d4096ddb7af353575b23324dce80.zip
1 files changed, 40 insertions, 0 deletions
diff --git a/core/src/main/java/de/danoeh/antennapod/core/util/syndication/HtmlToPlainText.java b/core/src/main/java/de/danoeh/antennapod/core/util/syndication/HtmlToPlainText.java
index 61072f1ad..3550f28c6 100644
--- a/core/src/main/java/de/danoeh/antennapod/core/util/syndication/HtmlToPlainText.java
+++ b/core/src/main/java/de/danoeh/antennapod/core/util/syndication/HtmlToPlainText.java
@@ -1,12 +1,19 @@
 package de.danoeh.antennapod.core.util.syndication;
 
+import android.text.TextUtils;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
 import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
 
+import java.util.regex.Pattern;
+
 /**
  * This class is based on <code>HtmlToPlainText</code> from jsoup's examples package.
  *
@@ -27,6 +34,39 @@ import org.jsoup.select.NodeVisitor;
 public class HtmlToPlainText {
 
     /**
+     * Use this method to strip off HTML encoding from given text
+     * <p>
+     * Replaces bullet points with *, ignores colors/bold/...
+     *
+     * @param str text that may contain HTML markup; null and empty input are accepted
+     * @return trimmed plain text if a tag was found, "" for null or empty input, otherwise the input unchanged
+     */
+    public static String getPlainText(String str) {
+        // Null and empty input both normalize to an empty string.
+        if (TextUtils.isEmpty(str)) {
+            return "";
+        }
+        // Text in which no tag is detected is handed back untouched (not trimmed).
+        if (!isHtml(str)) {
+            return str;
+        }
+        return StringUtils.trim(new HtmlToPlainText().getPlainText(Jsoup.parse(str)));
+    }
+
+    /** Matches a {@code <...>} span; quoted sections inside it may contain {@code >}. Compiled once. */
+    private static final Pattern HTML_TAG_PATTERN = Pattern.compile("<(\"[^\"]*\"|'[^']*'|[^'\">])*>");
+
+    /**
+     * Use this method to determine if a given text has any HTML tag
+     *
+     * @param str String to be tested for presence of HTML content, must not be null
+     * @return <b>True</b> if text contains any HTML tag<br><b>False</b> if no HTML tag is found
+     */
+    private static boolean isHtml(String str) {
+        return HTML_TAG_PATTERN.matcher(str).find();
+    }
+
+    /**
      * Format an Element to plain-text
      * @param element the root element to format
      * @return formatted text
author	fossterer <shashank@linux.com>	2019-10-24 23:20:31 -0400
committer	fossterer <shashank@linux.com>	2019-10-24 23:20:31 -0400
commit	ca83c5953727d4096ddb7af353575b23324dce80 (patch)
tree	562983ca2746e3f94089ffcb68e66c8d30ec9ae6 /core/src
parent	437f3f29c06e4a253e841fc959c768aec8d565a0 (diff)
download	AntennaPod-ca83c5953727d4096ddb7af353575b23324dce80.zip