Boyer–Moore string search is faster but not the same semantics

author: George Fraser <george@fivetran.com> 2018-12-29 01:15:04 -0800
committer: George Fraser <george@fivetran.com> 2018-12-29 01:15:04 -0800
commit: 42db035b1099267db8ad4c7f39e3991ff4aad697 (patch)
tree: d10256ce649d53ed1c4d25f0cc13f14f6825d72e /src
parent: f342aa14b13149cf061942584edd447f37edd3cd (diff)
download: java-language-server-42db035b1099267db8ad4c7f39e3991ff4aad697.zip
4 files changed, 196 insertions, 1 deletions
diff --git a/src/main/java/org/javacs/JavaCompilerService.java b/src/main/java/org/javacs/JavaCompilerService.java
index fd5abb5..06ac779 100644
--- a/src/main/java/org/javacs/JavaCompilerService.java
+++ b/src/main/java/org/javacs/JavaCompilerService.java
@@ -339,14 +339,16 @@ public class JavaCompilerService {
     }
 
     public List<TreePath> findSymbols(String query, int limit) {
-        LOG.info(String.format("Searching for `%s`", query));
+        LOG.info(String.format("Searching for `%s`...", query));
 
         var result = new ArrayList<TreePath>();
         var files = allJavaSources();
         for (var file : files) {
             if (!Parser.containsWordMatching(file, query)) continue;
+            LOG.info(String.format("...%s contains text matches", file.getFileName()));
             var parse = Parser.parse(file);
             var symbols = Parser.findSymbolsMatching(parse, query);
+            if (symbols.size() > 0) LOG.info(String.format("...found %d occurrences", symbols.size()));
             result.addAll(symbols);
             if (result.size() >= limit) break;
         }
diff --git a/src/main/java/org/javacs/Parser.java b/src/main/java/org/javacs/Parser.java
index 6418c74..27aa40e 100644
--- a/src/main/java/org/javacs/Parser.java
+++ b/src/main/java/org/javacs/Parser.java
@@ -111,6 +111,19 @@ class Parser {
         }
     }
 
+    static boolean containsText(Path java, String query) {
+        try {
+            var search = new StringSearch(query);
+            var reader = Files.newBufferedReader(java);
+            for (var line = reader.readLine(); line != null; line = reader.readLine()) {
+                if (search.next(line) != -1) return true;
+            }
+            return false;
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
     static List<TreePath> findSymbolsMatching(CompilationUnitTree parse, String query) {
         class Find extends TreePathScanner<Void, Void> {
             List<TreePath> found = new ArrayList<>();
diff --git a/src/main/java/org/javacs/StringSearch.java b/src/main/java/org/javacs/StringSearch.java
new file mode 100644
index 0000000..12d6f02
--- /dev/null
+++ b/src/main/java/org/javacs/StringSearch.java
@@ -0,0 +1,148 @@
+package org.javacs;
+
+// Translated from https://golang.org/src/strings/search.go
+
+// Search efficiently finds strings in a source text. It's implemented
+// using the Boyer-Moore string search algorithm:
+// https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
+// https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
+// document uses 1-based indexing)
+class StringSearch {
+    // pattern is the string that we are searching for in the text.
+    private final byte[] pattern;
+
+    // badCharSkip[b] contains the distance between the last byte of pattern
+    // and the rightmost occurrence of b in pattern. If b is not in pattern,
+    // badCharSkip[b] is len(pattern).
+    //
+    // Whenever a mismatch is found with byte b in the text, we can safely
+    // shift the matching frame at least badCharSkip[b] until the next time
+    // the matching char could be in alignment.
+    // TODO 256 is not coloring
+    private final int[] badCharSkip = new int[256];
+
+    // goodSuffixSkip[i] defines how far we can shift the matching frame given
+    // that the suffix pattern[i+1:] matches, but the byte pattern[i] does
+    // not. There are two cases to consider:
+    //
+    // 1. The matched suffix occurs elsewhere in pattern (with a different
+    // byte preceding it that we might possibly match). In this case, we can
+    // shift the matching frame to align with the next suffix chunk. For
+    // example, the pattern "mississi" has the suffix "issi" next occurring
+    // (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
+    // shift+len(suffix) == 3+4 == 7.
+    //
+    // 2. If the matched suffix does not occur elsewhere in pattern, then the
+    // matching frame may share part of its prefix with the end of the
+    // matching suffix. In this case, goodSuffixSkip[i] will contain how far
+    // to shift the frame to align this portion of the prefix to the
+    // suffix. For example, in the pattern "abcxxxabc", when the first
+    // mismatch from the back is found to be in position 3, the matching
+    // suffix "xxabc" is not found elsewhere in the pattern. However, its
+    // rightmost "abc" (at position 6) is a prefix of the whole pattern, so
+    // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
+    private final int[] goodSuffixSkip;
+
+    StringSearch(String pattern) {
+        this(pattern.getBytes());
+    }
+
+    StringSearch(byte[] pattern) {
+        this.pattern = pattern;
+        this.goodSuffixSkip = new int[pattern.length];
+
+        // last is the index of the last character in the pattern.
+        var last = pattern.length - 1;
+
+        // Build bad character table.
+        // Bytes not in the pattern can skip one pattern's length.
+        for (var i = 0; i < badCharSkip.length; i++) {
+            badCharSkip[i] = pattern.length;
+        }
+        // The loop condition is < instead of <= so that the last byte does not
+        // have a zero distance to itself. Finding this byte out of place implies
+        // that it is not in the last position.
+        for (var i = 0; i < last; i++) {
+            badCharSkip[pattern[i] + 128] = last - i;
+        }
+
+        // Build good suffix table.
+        // First pass: set each value to the next index which starts a prefix of
+        // pattern.
+        var lastPrefix = last;
+        for (var i = last; i >= 0; i--) {
+            if (hasPrefix(pattern, new Slice(pattern, i + 1))) lastPrefix = i + 1;
+            // lastPrefix is the shift, and (last-i) is len(suffix).
+            goodSuffixSkip[i] = lastPrefix + last - i;
+        }
+        // Second pass: find repeats of pattern's suffix starting from the front.
+        for (var i = 0; i < last; i++) {
+            var lenSuffix = longestCommonSuffix(pattern, new Slice(pattern, 1, i + 1));
+            if (pattern[i - lenSuffix] != pattern[last - lenSuffix]) {
+                // (last-i) is the shift, and lenSuffix is len(suffix).
+                goodSuffixSkip[last - lenSuffix] = lenSuffix + last - i;
+            }
+        }
+    }
+
+    int next(String text) {
+        return next(text.getBytes());
+    }
+
+    int next(byte[] text) {
+        var i = pattern.length - 1;
+        while (i < text.length) {
+            // Compare backwards from the end until the first unmatching character.
+            var j = pattern.length - 1;
+            while (j >= 0 && text[i] == pattern[j]) {
+                i--;
+                j--;
+            }
+            if (j < 0) {
+                return i + 1; // match
+            }
+            i += Math.max(badCharSkip[text[i] + 128], goodSuffixSkip[j]);
+        }
+        return -1;
+    }
+
+    private boolean hasPrefix(byte[] s, Slice prefix) {
+        for (int i = 0; i < prefix.length(); i++) {
+            if (s[i] != prefix.get(i)) return false;
+        }
+        return true;
+    }
+
+    private int longestCommonSuffix(byte[] a, Slice b) {
+        int i = 0;
+        for (; i < a.length && i < b.length(); i++) {
+            if (a[a.length - 1 - i] != b.get(b.length() - 1 - i)) {
+                break;
+            }
+        }
+        return i;
+    }
+
+    private static class Slice {
+        private final byte[] target;
+        private int from, until;
+
+        int length() {
+            return until - from;
+        }
+
+        byte get(int i) {
+            return target[from + i];
+        }
+
+        public Slice(byte[] target, int from) {
+            this(target, from, target.length);
+        }
+
+        public Slice(byte[] target, int from, int until) {
+            this.target = target;
+            this.from = from;
+            this.until = until;
+        }
+    }
+}
diff --git a/src/test/java/org/javacs/StringSearchTest.java b/src/test/java/org/javacs/StringSearchTest.java
new file mode 100644
index 0000000..7827fcd
--- /dev/null
+++ b/src/test/java/org/javacs/StringSearchTest.java
@@ -0,0 +1,32 @@
+package org.javacs;
+
+import static org.hamcrest.Matchers.*;
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+public class StringSearchTest {
+    private void testNext(String pat, String text, int index) {
+        var got = new StringSearch(pat).next(text);
+        assertThat(got, equalTo(index));
+    }
+
+    @Test
+    public void testFinderNext() {
+        testNext("", "", 0);
+        testNext("", "abc", 0);
+        testNext("abc", "", -1);
+        testNext("abc", "abc", 0);
+        testNext("d", "abcdefg", 3);
+        testNext("nan", "banana", 2);
+        testNext("pan", "anpanman", 2);
+        testNext("nnaaman", "anpanmanam", -1);
+        testNext("abcd", "abc", -1);
+        testNext("abcd", "bcd", -1);
+        testNext("bcd", "abcd", 1);
+        testNext("abc", "acca", -1);
+        testNext("aa", "aaa", 0);
+        testNext("baa", "aaaaa", -1);
+        testNext("at that", "which finally halts.  at that point", 22);
+    }
+}
author	George Fraser <george@fivetran.com>	2018-12-29 01:15:04 -0800
committer	George Fraser <george@fivetran.com>	2018-12-29 01:15:04 -0800
commit	42db035b1099267db8ad4c7f39e3991ff4aad697 (patch)
tree	d10256ce649d53ed1c4d25f0cc13f14f6825d72e /src
parent	f342aa14b13149cf061942584edd447f37edd3cd (diff)
download	java-language-server-42db035b1099267db8ad4c7f39e3991ff4aad697.zip