summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTony Tam <149837+tonytamsf@users.noreply.github.com>2024-02-05 04:42:59 +0800
committerByteHamster <info@bytehamster.com>2024-05-18 18:58:01 +0200
commit7c4f19c9798b7c2c0c059a17fdfe843731cba5b4 (patch)
tree3bca1ec2aa4fba086fb1ef63e5fbab4111254b23
parent27e9bf36b1696ea2f35cc342964e9659087a5948 (diff)
downloadAntennaPod-7c4f19c9798b7c2c0c059a17fdfe843731cba5b4.zip
Transcript semantic parsing (#6852)
-rw-r--r--model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java9
-rw-r--r--model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java28
-rw-r--r--model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java31
-rw-r--r--parser/transcript/README.md3
-rw-r--r--parser/transcript/build.gradle23
-rw-r--r--parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java65
-rw-r--r--parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java118
-rw-r--r--parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java24
-rw-r--r--parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java84
-rw-r--r--parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java93
-rw-r--r--settings.gradle1
11 files changed, 479 insertions, 0 deletions
diff --git a/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java b/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
index 0f5a3f4bb..1e623fd8e 100644
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
@@ -46,6 +46,7 @@ public class FeedItem implements Serializable {
private String podcastIndexTranscriptUrl;
private String podcastIndexTranscriptType;
private String podcastIndexTranscriptText;
+ private Transcript transcript;
private int state;
public static final int NEW = -1;
@@ -463,6 +464,14 @@ public class FeedItem implements Serializable {
}
}
+    /**
+     * Returns the parsed transcript that was attached via {@link #setTranscript(Transcript)}.
+     * May be {@code null} when no transcript has been attached.
+     */
+    public Transcript getTranscript() {
+        return this.transcript;
+    }
+
+    /**
+     * Attaches a parsed transcript to this item, replacing any previously attached one.
+     *
+     * @param t the transcript to store; stored as-is without copying
+     */
+    public void setTranscript(Transcript t) {
+        this.transcript = t;
+    }
+
public String getPodcastIndexTranscriptText() {
return podcastIndexTranscriptText;
}
diff --git a/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java b/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
new file mode 100644
index 000000000..da01c0e58
--- /dev/null
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
@@ -0,0 +1,28 @@
+package de.danoeh.antennapod.model.feed;
+
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * A transcript made of {@link TranscriptSegment}s that are kept sorted by their start time.
+ * Times use the same unit as {@link TranscriptSegment#getStartTime()}.
+ */
+public class Transcript {
+
+    /** Segments keyed by start time; adding a segment with an existing start time replaces the old one. */
+    private final TreeMap<Long, TranscriptSegment> segmentsMap = new TreeMap<>();
+
+    /**
+     * Adds a segment, indexed by its start time.
+     *
+     * @param segment the segment to add; must not be {@code null}
+     */
+    public void addSegment(TranscriptSegment segment) {
+        segmentsMap.put(segment.getStartTime(), segment);
+    }
+
+    /**
+     * Returns the segment with the greatest start time that is less than or equal to {@code time}.
+     * The segment's end time is not checked, so a time that lies after the end of the last
+     * preceding segment still returns that segment.
+     *
+     * @param time the position to look up
+     * @return the matching segment, or {@code null} if every segment starts after {@code time}
+     */
+    public TranscriptSegment getSegmentAtTime(long time) {
+        // Look the entry up once instead of querying the tree twice.
+        Map.Entry<Long, TranscriptSegment> entry = segmentsMap.floorEntry(time);
+        return entry == null ? null : entry.getValue();
+    }
+
+    /**
+     * @return the number of segments stored in this transcript
+     */
+    public int getSegmentCount() {
+        return segmentsMap.size();
+    }
+
+    /**
+     * Returns the entry with the smallest start time that is greater than or equal to {@code time}.
+     *
+     * @param time the position to look up
+     * @return the matching entry (start time and segment), or {@code null} if there is none
+     */
+    public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) {
+        return segmentsMap.ceilingEntry(time);
+    }
+}
diff --git a/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
new file mode 100644
index 000000000..0101bb8ed
--- /dev/null
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
@@ -0,0 +1,31 @@
+package de.danoeh.antennapod.model.feed;
+
+/**
+ * Immutable value holder for one piece of a transcript: a time range, the spoken words
+ * and the speaker they are attributed to.
+ */
+public class TranscriptSegment {
+    private final long startTime;
+    private final long endTime;
+    private final String words;
+    private final String speaker;
+
+    /**
+     * @param start start of the segment
+     * @param end end of the segment
+     * @param w the text of the segment
+     * @param s the speaker of the segment
+     */
+    public TranscriptSegment(long start, long end, String w, String s) {
+        this.startTime = start;
+        this.endTime = end;
+        this.words = w;
+        this.speaker = s;
+    }
+
+    /** @return the start of this segment, as passed to the constructor */
+    public long getStartTime() {
+        return this.startTime;
+    }
+
+    /** @return the end of this segment, as passed to the constructor */
+    public long getEndTime() {
+        return this.endTime;
+    }
+
+    /** @return the text of this segment */
+    public String getWords() {
+        return this.words;
+    }
+
+    /** @return the speaker of this segment */
+    public String getSpeaker() {
+        return this.speaker;
+    }
+}
\ No newline at end of file
diff --git a/parser/transcript/README.md b/parser/transcript/README.md
new file mode 100644
index 000000000..a6ca61612
--- /dev/null
+++ b/parser/transcript/README.md
@@ -0,0 +1,3 @@
+# :parser:transcript
+
+This module provides parsers for episode transcripts in JSON and SRT format.
diff --git a/parser/transcript/build.gradle b/parser/transcript/build.gradle
new file mode 100644
index 000000000..122c74025
--- /dev/null
+++ b/parser/transcript/build.gradle
@@ -0,0 +1,23 @@
+plugins {
+ id("com.android.library")
+}
+apply from: "../../common.gradle"
+
+android {
+ namespace "de.danoeh.antennapod.parser.transcript"
+}
+
+dependencies {
+ implementation project(':model')
+
+ annotationProcessor "androidx.annotation:annotation:$annotationVersion"
+
+ implementation "androidx.core:core:$coreVersion"
+
+ implementation "org.apache.commons:commons-lang3:$commonslangVersion"
+ implementation "commons-io:commons-io:$commonsioVersion"
+ implementation "org.jsoup:jsoup:$jsoupVersion"
+
+ testImplementation "junit:junit:$junitVersion"
+ testImplementation "org.robolectric:robolectric:$robolectricVersion"
+}
diff --git a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
new file mode 100644
index 000000000..78f3bf9c8
--- /dev/null
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
@@ -0,0 +1,65 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.internal.StringUtil;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+import de.danoeh.antennapod.model.feed.TranscriptSegment;
+
+/**
+ * Parses JSON transcripts: an object with a {@code segments} array whose entries carry
+ * {@code startTime} and {@code endTime} (in seconds), plus optional {@code speaker} and {@code body}.
+ * Consecutive entries are merged until they span at least {@code TranscriptParser.MIN_SPAN} milliseconds.
+ */
+public class JsonTranscriptParser {
+    /**
+     * Parses the given JSON document into a {@link Transcript}.
+     *
+     * @param jsonStr the JSON document
+     * @return the transcript, or {@code null} if the input is {@code null}, is not valid JSON,
+     *         has no {@code segments} array or yields no segment
+     */
+    public static Transcript parse(String jsonStr) {
+        if (jsonStr == null) {
+            return null;
+        }
+        try {
+            Transcript transcript = new Transcript();
+            long endTime = -1L; // end of the last VALID entry seen so far, in milliseconds
+            long segmentStartTime = -1L; // start of the span currently being merged, -1 if none
+            long duration = 0L;
+            String speaker = "";
+            StringBuilder segmentBody = new StringBuilder();
+            JSONObject obj = new JSONObject(jsonStr);
+            JSONArray objSegments = obj.getJSONArray("segments");
+
+            for (int i = 0; i < objSegments.length(); i++) {
+                JSONObject jsonObject = objSegments.getJSONObject(i);
+                // Round instead of truncating so that e.g. 1.005 s becomes 1005 ms, not 1004 ms.
+                long entryStart = Math.round(jsonObject.optDouble("startTime", -1) * 1000L);
+                long entryEnd = Math.round(jsonObject.optDouble("endTime", -1) * 1000L);
+                if (entryStart < 0 || entryEnd < 0) {
+                    // Skip entries with missing or invalid times. They must not touch endTime,
+                    // otherwise a pending span would be flushed with a bogus end time below.
+                    continue;
+                }
+                endTime = entryEnd;
+                if (segmentStartTime == -1L) {
+                    segmentStartTime = entryStart;
+                }
+                duration += entryEnd - entryStart;
+
+                // NOTE(review): a merged span is attributed to the speaker of its last entry.
+                speaker = jsonObject.optString("speaker");
+                segmentBody.append(jsonObject.optString("body")).append(' ');
+
+                if (duration >= TranscriptParser.MIN_SPAN) {
+                    transcript.addSegment(new TranscriptSegment(segmentStartTime, endTime,
+                            StringUtils.trim(segmentBody.toString()), speaker));
+                    duration = 0L;
+                    segmentBody.setLength(0);
+                    segmentStartTime = -1L;
+                }
+            }
+
+            // Flush whatever is left of a span that never reached MIN_SPAN.
+            if (!StringUtil.isBlank(segmentBody.toString())) {
+                transcript.addSegment(new TranscriptSegment(segmentStartTime, endTime,
+                        StringUtils.trim(segmentBody.toString()), speaker));
+            }
+
+            if (transcript.getSegmentCount() > 0) {
+                return transcript;
+            } else {
+                return null;
+            }
+
+        } catch (org.json.JSONException e) {
+            e.printStackTrace();
+        }
+        return null;
+    }
+}
diff --git a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
new file mode 100644
index 000000000..098dadd99
--- /dev/null
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
@@ -0,0 +1,118 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.internal.StringUtil;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+import de.danoeh.antennapod.model.feed.TranscriptSegment;
+
+/**
+ * Parses SRT transcripts. Each cue consists of a timecode line
+ * ({@code 00:00:00,000 --> 00:00:02,730}) followed by text lines up to the next blank line.
+ * Text before the first colon of a cue is treated as the speaker name.
+ * Consecutive cues are merged until they span at least {@code TranscriptParser.MIN_SPAN} milliseconds.
+ */
+public class SrtTranscriptParser {
+    private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$");
+
+    /**
+     * Parses the given SRT document into a {@link Transcript}.
+     *
+     * @param str the SRT document
+     * @return the transcript, or {@code null} if the input is blank or yields no segment
+     */
+    public static Transcript parse(String str) {
+        if (StringUtils.isBlank(str)) {
+            return null;
+        }
+        str = str.replaceAll("\r\n", "\n");
+
+        Transcript transcript = new Transcript();
+        List<String> lines = Arrays.asList(str.split("\n"));
+        Iterator<String> iter = lines.iterator();
+        String speaker = "";
+        String segmentBody = "";
+        long spanStartTimecode = -1L; // start of the span currently being merged, -1 if none
+        long endTimecode = -1L; // end of the last VALID cue seen so far
+        long duration = 0L;
+
+        while (iter.hasNext()) {
+            String line = iter.next();
+            if (!line.contains("-->")) {
+                // Blank lines, sequence numbers and text that belongs to a rejected cue.
+                continue;
+            }
+            String[] timecodes = line.split("-->");
+            if (timecodes.length < 2) {
+                continue;
+            }
+            long cueStart = parseTimecode(timecodes[0].trim());
+            long cueEnd = parseTimecode(timecodes[1].trim());
+            if (cueStart == -1 || cueEnd == -1) {
+                // Reject the cue without touching endTimecode, otherwise text that is still
+                // pending from earlier valid cues would be dropped by the final flush.
+                continue;
+            }
+
+            endTimecode = cueEnd;
+            if (spanStartTimecode == -1) {
+                spanStartTimecode = cueStart;
+            }
+            duration += cueEnd - cueStart;
+
+            String body = readCueText(iter);
+            // Split on the FIRST colon only, so that colons inside the text (e.g. "10:30") survive.
+            int colon = body.indexOf(':');
+            if (colon != -1) {
+                speaker = body.substring(0, colon).trim();
+                body = body.substring(colon + 1).trim();
+            }
+            if (StringUtil.isBlank(body)) {
+                continue;
+            }
+
+            segmentBody = StringUtils.trim(segmentBody + " " + body);
+            if (duration >= TranscriptParser.MIN_SPAN && endTimecode > spanStartTimecode) {
+                transcript.addSegment(new TranscriptSegment(spanStartTimecode,
+                        endTimecode,
+                        segmentBody,
+                        speaker));
+                duration = 0L;
+                spanStartTimecode = -1L;
+                segmentBody = "";
+            }
+        }
+
+        // Flush whatever is left of a span that never reached MIN_SPAN.
+        if (!StringUtil.isBlank(segmentBody) && endTimecode > spanStartTimecode) {
+            segmentBody = StringUtils.trim(segmentBody);
+            transcript.addSegment(new TranscriptSegment(spanStartTimecode,
+                    endTimecode,
+                    segmentBody,
+                    speaker));
+        }
+        if (transcript.getSegmentCount() > 0) {
+            return transcript;
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Consumes the text lines of one cue, up to and including the terminating blank line
+     * (or the end of input), and joins them with single spaces.
+     * Checks {@code hasNext()} before every read, so a timecode at the very end of the
+     * input yields an empty text instead of a NoSuchElementException.
+     */
+    private static String readCueText(Iterator<String> iter) {
+        StringBuilder text = new StringBuilder();
+        while (iter.hasNext()) {
+            String line = iter.next();
+            if (StringUtil.isBlank(line)) {
+                break;
+            }
+            // trim() instead of strip(): strip() is a Java 11 API missing on older Android versions.
+            text.append(line.trim()).append(' ');
+        }
+        return text.toString().trim();
+    }
+
+    /**
+     * Converts a timecode in the format {@code 00:00:00,000} to milliseconds.
+     *
+     * @param timecode the timecode without surrounding whitespace
+     * @return the time in milliseconds, or -1 if the timecode does not match the format
+     */
+    static long parseTimecode(String timecode) {
+        Matcher matcher = TIMECODE_PATTERN.matcher(timecode);
+        if (!matcher.matches()) {
+            return -1;
+        }
+        long hours = Integer.parseInt(matcher.group(1));
+        long minutes = Integer.parseInt(matcher.group(2));
+        long seconds = Integer.parseInt(matcher.group(3));
+        long milliseconds = Integer.parseInt(matcher.group(4));
+        return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
+    }
+}
diff --git a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
new file mode 100644
index 000000000..0a4025d96
--- /dev/null
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
@@ -0,0 +1,24 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+
+/**
+ * Entry point for transcript parsing: picks the parser that matches the given MIME type.
+ */
+public class TranscriptParser {
+    static final long MIN_SPAN = 1000L; // merge short segments together to form a span of 1 second
+
+    /**
+     * Parses a transcript of the given MIME type.
+     *
+     * @param str the raw transcript
+     * @param type the MIME type of the transcript
+     * @return the parsed transcript, or {@code null} if the input is blank, the type is not
+     *         supported or the selected parser could not produce a transcript
+     */
+    public static Transcript parse(String str, String type) {
+        // isBlank() already covers null; a null type can never match and must not reach the switch.
+        if (StringUtils.isBlank(str) || type == null) {
+            return null;
+        }
+
+        switch (type) {
+            case "application/json":
+                return JsonTranscriptParser.parse(str);
+            case "application/srt":
+            case "application/srr":
+            case "application/x-subrip":
+                return SrtTranscriptParser.parse(str);
+            default:
+                return null;
+        }
+    }
+}
diff --git a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
new file mode 100644
index 000000000..48996f492
--- /dev/null
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
@@ -0,0 +1,84 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+
+@RunWith(RobolectricTestRunner.class)
+/**
+ * Tests for {@link JsonTranscriptParser} and for the JSON branch of {@link TranscriptParser}.
+ * Assertions use JUnit's (expected, actual) argument order so that failure messages read correctly.
+ */
+public class JsonTranscriptParserTest {
+    private static String jsonStr = "{'version': '1.0.0', "
+            + "'segments': [ "
+            + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
+            + "{ 'speaker' : 'Sally Green', 'startTime': 1.91, 'endTime': 2.8, 'body': 'this merges' },"
+            + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
+            + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
+
+    @Test
+    public void testParseJson() {
+        Transcript result = JsonTranscriptParser.parse(jsonStr);
+
+        assertNull(result.getSegmentAtTime(0L));
+        assertEquals("John Doe", result.getSegmentAtTime(800L).getSpeaker());
+        assertEquals(800L, result.getSegmentAtTime(800L).getStartTime());
+        assertEquals(1900L, result.getSegmentAtTime(800L).getEndTime());
+        assertEquals(1910L, (long) result.getEntryAfterTime(1800L).getKey());
+        // 2 segments get merged into at least 1 second
+        assertEquals("this merges the", result.getEntryAfterTime(1800L).getValue().getWords());
+    }
+
+    @Test
+    public void testParse() {
+        String type = "application/json";
+        Transcript result = TranscriptParser.parse(jsonStr, type);
+        // There isn't a segment at 900L, so go backwards and get the segment at 800L
+        assertEquals("John Doe", result.getSegmentAtTime(900L).getSpeaker());
+        assertEquals("And", result.getSegmentAtTime(930L).getWords());
+
+        // blank string
+        String blankStr = "";
+        result = TranscriptParser.parse(blankStr, type);
+        assertNull(result);
+
+        result = TranscriptParser.parse(null, type);
+        assertNull(result);
+
+        // All blank lines
+        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
+        result = TranscriptParser.parse(allNewlinesStr, type);
+        assertNull(result);
+
+        // segments is missing
+        String jsonStrBad1 = "{'version': '1.0.0', "
+                + "'segmentsX': [ "
+                + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
+                + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
+                + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
+        result = TranscriptParser.parse(jsonStrBad1, type);
+        assertNull(result);
+
+        // invalid time formatting
+        String jsonStrBad2 = "{'version': '1.0.0', "
+                + "'segments': [ "
+                + "{ 'speaker' : 'XJohn Doe', 'startTime': stringTime, 'endTime': stringTime, 'body': 'And' },"
+                + "{ 'XstartTime': 2.9, 'XendTime': 3.4, 'body': 'the' },"
+                + "{ 'startTime': '-2.9', 'endTime': '-3.4', 'body': 'the' },"
+                + "{ 'startTime': 'bad_time', 'endTime': '-3.4', 'body': 'the' }]}";
+        result = TranscriptParser.parse(jsonStrBad2, type);
+        assertNull(result);
+
+        // Just plain text
+        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(strBad3, type);
+        assertNull(result);
+
+        // passing the wrong type
+        type = "application/srt";
+        result = TranscriptParser.parse(jsonStr, type);
+        assertNull(result);
+    }
+}
diff --git a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
new file mode 100644
index 000000000..f7854c5bf
--- /dev/null
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
@@ -0,0 +1,93 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import de.danoeh.antennapod.model.feed.Transcript;
+
+@RunWith(RobolectricTestRunner.class)
+/**
+ * Tests for {@link SrtTranscriptParser} and for the SRT branch of {@link TranscriptParser}.
+ * Assertions use JUnit's (expected, actual) argument order so that failure messages read correctly.
+ */
+public class SrtTranscriptParserTest {
+    private static String srtStr = "1\n"
+            + "00:00:00,000 --> 00:00:02,730\n"
+            + "John Doe: Promoting your podcast in a new\n\n"
+            + "2\n"
+            + "00:00:02,730 --> 00:00:04,600\n"
+            + "way. The latest from PogNews.\n\n"
+            + "00:00:04,730 --> 00:00:05,600\n"
+            + "way. The latest from PogNews.";
+
+    @Test
+    public void testParseSrt() {
+        Transcript result = SrtTranscriptParser.parse(srtStr);
+
+        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(0L).getWords());
+        assertEquals("John Doe", result.getSegmentAtTime(0L).getSpeaker());
+        assertEquals(0L, result.getSegmentAtTime(0L).getStartTime());
+        assertEquals(2730L, result.getSegmentAtTime(0L).getEndTime());
+        assertEquals(2730L, (long) result.getEntryAfterTime(1000L).getKey());
+        assertEquals("way. The latest from PogNews.", result.getEntryAfterTime(1000L).getValue().getWords());
+    }
+
+    @Test
+    public void testParse() {
+        String type = "application/srr";
+        Transcript result;
+
+        result = TranscriptParser.parse(srtStr, type);
+        // There isn't a segment at 800L, so go backwards and get the segment at 0L
+        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(800L).getWords());
+
+        result = TranscriptParser.parse(null, type);
+        assertNull(result);
+
+        // blank string
+        String blankStr = "";
+        result = TranscriptParser.parse(blankStr, type);
+        assertNull(result);
+
+        // All empty lines
+        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
+        result = TranscriptParser.parse(allNewlinesStr, type);
+        assertNull(result);
+
+        // first segment has invalid time formatting, so the entire segment will be thrown out
+        String srtStrBad1 = "00:0000,000 --> 00:00:02,730\n"
+                + "John Doe: Promoting your podcast in a new\n\n"
+                + "2\n"
+                + "00:00:02,730 --> 00:00:04,600\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(srtStrBad1, type);
+        assertEquals("way. The latest from PogNews.", result.getSegmentAtTime(2730L).getWords());
+
+        // first segment has invalid time in end time, 2nd segment has invalid time in both start time and end time
+        String srtStrBad2 = "00:00:00,000 --> 00:0002,730\n"
+                + "Jane Doe: Promoting your podcast in a new\n\n"
+                + "2\n"
+                + "badstarttime --> badendtime\n"
+                + "way. The latest from PogNews.\n"
+                + "badstarttime -->\n"
+                + "Jane Doe says something\n"
+                + "00:00:00,000 --> 00:00:02,730\n"
+                + "Jane Doe:";
+        result = TranscriptParser.parse(srtStrBad2, type);
+        assertNull(result);
+
+        // Just plain text
+        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(strBad3, type);
+        assertNull(result);
+
+        // passing the wrong type
+        type = "application/json";
+        result = TranscriptParser.parse(srtStr, type);
+        assertNull(result);
+
+        type = "unknown";
+        result = TranscriptParser.parse(srtStr, type);
+        assertNull(result);
+    }
+}
+
diff --git a/settings.gradle b/settings.gradle
index 8cf8baf3e..3b3df7ba8 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -30,6 +30,7 @@ include ':net:sync:service'
include ':parser:feed'
include ':parser:media'
+include ':parser:transcript'
include ':playback:base'
include ':playback:cast'