aboutsummaryrefslogtreecommitdiff
path: root/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringDistanceFunctions.java
blob: 0b027694450b18ac21e0dcd149b1940006d75cf8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.drill.exec.expr.fn.impl;

import org.apache.drill.exec.expr.DrillSimpleFunc;
import org.apache.drill.exec.expr.annotations.FunctionTemplate;
import org.apache.drill.exec.expr.annotations.Output;
import org.apache.drill.exec.expr.annotations.Param;
import org.apache.drill.exec.expr.annotations.Workspace;
import org.apache.drill.exec.expr.holders.Float8Holder;
import org.apache.drill.exec.expr.holders.VarCharHolder;

public class StringDistanceFunctions {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StringDistanceFunctions.class);

  private StringDistanceFunctions() {
  }

  /**
   * This function calculates the cosine distance between two strings.
   * Usage:  SELECT cosine_distance( string1, string2 ) AS cosine_distance FROM...
   */

  @FunctionTemplate(name = "cosine_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
  public static class CosineDistanceFunction implements DrillSimpleFunc {

    @Param
    VarCharHolder rawInput1;

    @Param
    VarCharHolder rawInput2;

    @Workspace
    org.apache.commons.text.similarity.CosineDistance d;

    @Output
    Float8Holder out;

    @Override
    public void setup() {
      d = new org.apache.commons.text.similarity.CosineDistance();
    }

    @Override
    public void eval() {

      String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
      String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);


      double result = d.apply(input1, input2);
      out.value = result;
    }
  }

  /**
   * This function calculates the cosine distance between two strings.
   * A matching algorithm that is similar to the searching algorithms implemented in editors such
   * as Sublime Text, TextMate, Atom and others.
   * <p>
   * One point is given for every matched character. Subsequent matches yield two bonus points. A higher score
   * indicates a higher similarity.
   * <p>
   * <p>
   * Usage:  SELECT fuzzy_score( string1, string2 ) AS fuzzy_score FROM...
   */

  @FunctionTemplate(name = "fuzzy_score", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
  public static class FuzzyScoreFunction implements DrillSimpleFunc {

    @Param
    VarCharHolder rawInput1;

    @Param
    VarCharHolder rawInput2;

    @Output
    Float8Holder out;

    @Workspace
    org.apache.commons.text.similarity.FuzzyScore d;

    @Override
    public void setup() {
      d = new org.apache.commons.text.similarity.FuzzyScore(java.util.Locale.ENGLISH);
    }

    @Override
    public void eval() {

      String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
      String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);

      double result = d.fuzzyScore(input1, input2);
      out.value = result;
    }
  }

  /**
   * The hamming distance between two strings of equal length is the number of
   * positions at which the corresponding symbols are different.
   * <p>
   * For further explanation about the Hamming Distance, take a look at its
   * Wikipedia page at http://en.wikipedia.org/wiki/Hamming_distance.
   * <p>
   * Usage:  SELECT hamming_distance( string1, string2 ) FROM...
   */


  @FunctionTemplate(name = "hamming_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
  public static class HammingDistanceFunction implements DrillSimpleFunc {

    @Param
    VarCharHolder rawInput1;

    @Param
    VarCharHolder rawInput2;

    @Output
    Float8Holder out;

    @Workspace
    org.apache.commons.text.similarity.HammingDistance d;

    @Override
    public void setup() {
      d = new org.apache.commons.text.similarity.HammingDistance();
    }

    @Override
    public void eval() {

      String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
      String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);

      double result = d.apply(input1, input2);
      out.value = result;
    }
  }


  /**
   * Measures the Jaccard distance of two sets of character sequence. Jaccard
   * distance is the dissimilarity between two sets. It is the complementary of
   * Jaccard similarity.
   * <p>
   * For further explanation about Jaccard Distance, refer
   * https://en.wikipedia.org/wiki/Jaccard_index
   * <p>
   * Usage:  SELECT jaccard_distance( string1, string2 ) FROM ...
   */


  @FunctionTemplate(name = "jaccard_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
  public static class JaccardDistanceFunction implements DrillSimpleFunc {

    @Param
    VarCharHolder rawInput1;

    @Param
    VarCharHolder rawInput2;

    @Output
    Float8Holder out;

    @Workspace
    org.apache.commons.text.similarity.JaccardDistance d;

    @Override
    public void setup() {
      d = new org.apache.commons.text.similarity.JaccardDistance();
    }

    @Override
    public void eval() {

      String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
      String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);

      double result = d.apply(input1, input2);
      out.value = result;
    }
  }

  /**
   * A similarity algorithm indicating the percentage of matched characters between two character sequences.
   * <p>
   * The Jaro measure is the weighted sum of percentage of matched characters
   * from each file and transposed characters. Winkler increased this measure
   * for matching initial characters.
   * <p>
   * This implementation is based on the Jaro Winkler similarity algorithm
   * from https://en.wikipedia.org/wiki/Jaro–Winkler_distance
   * <p>
   * Usage: SELECT jaro_distance( string1, string2 ) FROM...
   */

  @FunctionTemplate(name = "jaro_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
  public static class JaroDistanceFunction implements DrillSimpleFunc {

    @Param
    VarCharHolder rawInput1;

    @Param
    VarCharHolder rawInput2;

    @Output
    Float8Holder out;

    @Workspace
    org.apache.commons.text.similarity.JaroWinklerDistance d;

    @Override
    public void setup() {
      d = new org.apache.commons.text.similarity.JaroWinklerDistance();
    }

    @Override
    public void eval() {

      String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
      String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);

      double result = d.apply(input1, input2);
      out.value = result;
    }
  }

  /**
   * An algorithm for measuring the difference between two character sequences.
   * <p>
   * This is the number of changes needed to change one sequence into another,
   * where each change is a single character modification (deletion, insertion
   * or substitution).
   * <p>
   * Usage: SELECT levenshtein_distance( string1, string2 ) FROM...
   */

  @FunctionTemplate(name = "levenshtein_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
  public static class LevenstheinDistanceFunction implements DrillSimpleFunc {

    @Param
    VarCharHolder rawInput1;

    @Param
    VarCharHolder rawInput2;

    @Output
    Float8Holder out;

    @Workspace
    org.apache.commons.text.similarity.LevenshteinDistance d;

    @Override
    public void setup() {
      d = new org.apache.commons.text.similarity.LevenshteinDistance();
    }

    @Override
    public void eval() {

      String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
      String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);

      double result = d.apply(input1, input2);
      out.value = result;
    }
  }

  /**
   * The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in common.
   * Two strings that are entirely different, return a value of 0, and two strings that return a value of the
   * commonly shared length implies that the strings are completely the same in value and position.
   * Note: Generally this algorithm is fairly inefficient, as for length m, n of the input
   * CharSequence's left and right respectively, the runtime of the algorithm is O(m*n).
   * <p>
   * This implementation is based on the Longest Commons Substring algorithm from https://en.wikipedia.org/wiki/Longest_common_subsequence_problem.
   * <p>
   * Usage:  SELECT longest_common_substring_distance( string1, string2 ) FROM...
   */

  @FunctionTemplate(name = "longest_common_substring_distance", scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL)
  public static class LongestCommonSubstringDistanceFunction implements DrillSimpleFunc {

    @Param
    VarCharHolder rawInput1;

    @Param
    VarCharHolder rawInput2;

    @Output
    Float8Holder out;

    @Workspace
    org.apache.commons.text.similarity.LongestCommonSubsequenceDistance d;

    @Override
    public void setup() {
      d = new org.apache.commons.text.similarity.LongestCommonSubsequenceDistance();
    }

    @Override
    public void eval() {

      String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
      String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);

      double result = d.apply(input1, input2);
      out.value = result;
    }
  }

}