summaryrefslogtreecommitdiff
path: root/AK/FuzzyMatch.cpp
blob: f82db25af5a576b02ede00f121fb6f40acfdc7e2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
 * Copyright (c) 2021, Spencer Dixon <spencercdixon@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/CharacterTypes.h>
#include <AK/FuzzyMatch.h>
#include <string.h>

namespace AK {

static constexpr int const RECURSION_LIMIT = 10;
static constexpr int const MAX_MATCHES = 256;

// Bonuses and penalties are used to build up a final score for the match.
static constexpr int const SEQUENTIAL_BONUS = 15;            // bonus for adjacent matches (needle: 'ca', haystack: 'cat')
static constexpr int const SEPARATOR_BONUS = 30;             // bonus if match occurs after a separator ('_' or ' ')
static constexpr int const CAMEL_BONUS = 30;                 // bonus if match is uppercase and prev is lower (needle: 'myF' haystack: '/path/to/myFile.txt')
static constexpr int const FIRST_LETTER_BONUS = 20;          // bonus if the first letter is matched (needle: 'c' haystack: 'cat')
static constexpr int const LEADING_LETTER_PENALTY = -5;      // penalty applied for every letter in str before the first match
static constexpr int const MAX_LEADING_LETTER_PENALTY = -15; // maximum penalty for leading letters
static constexpr int const UNMATCHED_LETTER_PENALTY = -1;    // penalty for every letter that doesn't matter

static int calculate_score(StringView string, u8* index_points, size_t index_points_size)
{
    int out_score = 100;

    int penalty = LEADING_LETTER_PENALTY * index_points[0];
    if (penalty < MAX_LEADING_LETTER_PENALTY)
        penalty = MAX_LEADING_LETTER_PENALTY;
    out_score += penalty;

    int unmatched = string.length() - index_points_size;
    out_score += UNMATCHED_LETTER_PENALTY * unmatched;

    for (size_t i = 0; i < index_points_size; i++) {
        u8 current_idx = index_points[i];

        if (current_idx == 0)
            out_score += FIRST_LETTER_BONUS;

        if (i == 0)
            continue;

        u8 previous_idx = index_points[i - 1];
        if (current_idx - 1 == previous_idx)
            out_score += SEQUENTIAL_BONUS;

        u32 current_character = string[current_idx];
        u32 neighbor_character = string[current_idx - 1];

        if (neighbor_character != to_ascii_uppercase(neighbor_character) && current_character != to_ascii_lowercase(current_character))
            out_score += CAMEL_BONUS;

        if (neighbor_character == '_' || neighbor_character == ' ')
            out_score += SEPARATOR_BONUS;
    }

    return out_score;
}

static FuzzyMatchResult fuzzy_match_recursive(StringView needle, StringView haystack, size_t needle_idx, size_t haystack_idx,
    u8 const* src_matches, u8* matches, int next_match, int& recursion_count)
{
    int out_score = 0;

    ++recursion_count;
    if (recursion_count >= RECURSION_LIMIT)
        return { false, out_score };

    if (needle.length() == needle_idx || haystack.length() == haystack_idx)
        return { false, out_score };

    bool had_recursive_match = false;
    constexpr size_t recursive_match_limit = 256;
    u8 best_recursive_matches[recursive_match_limit];
    int best_recursive_score = 0;

    bool first_match = true;
    while (needle_idx < needle.length() && haystack_idx < haystack.length()) {

        if (to_ascii_lowercase(needle[needle_idx]) == to_ascii_lowercase(haystack[haystack_idx])) {
            if (next_match >= MAX_MATCHES)
                return { false, out_score };

            if (first_match && src_matches) {
                memcpy(matches, src_matches, next_match);
                first_match = false;
            }

            u8 recursive_matches[recursive_match_limit] {};
            auto result = fuzzy_match_recursive(needle, haystack, needle_idx, haystack_idx + 1, matches, recursive_matches, next_match, recursion_count);
            if (result.matched) {
                if (!had_recursive_match || result.score > best_recursive_score) {
                    memcpy(best_recursive_matches, recursive_matches, recursive_match_limit);
                    best_recursive_score = result.score;
                }
                had_recursive_match = true;
            }
            matches[next_match++] = haystack_idx;
            needle_idx++;
        }
        haystack_idx++;
    }

    bool matched = needle_idx == needle.length();
    if (!matched)
        return { false, out_score };

    out_score = calculate_score(haystack, matches, next_match);

    if (had_recursive_match && (best_recursive_score > out_score)) {
        memcpy(matches, best_recursive_matches, MAX_MATCHES);
        out_score = best_recursive_score;
    }

    return { true, out_score };
}

// This fuzzy_match algorithm is based off a similar algorithm used by Sublime Text. The key insight is that instead
// of doing a total in the distance between characters (I.E. Levenshtein Distance), we apply some meaningful heuristics
// related to our dataset that we're trying to match to build up a score. Scores can then be sorted and displayed
// with the highest at the top.
//
// Scores are not normalized between any values and have no particular meaning. The starting value is 100 and when we
// detect good indicators of a match we add to the score. When we detect bad indicators, we penalize the match and subtract
// from its score. Therefore, the longer the needle/haystack the greater the range of scores could be.
FuzzyMatchResult fuzzy_match(StringView needle, StringView haystack)
{
    int recursion_count = 0;
    u8 matches[MAX_MATCHES] {};
    return fuzzy_match_recursive(needle, haystack, 0, 0, nullptr, matches, 0, recursion_count);
}

}