Add MATCH_RECOGNIZE related UDFs to the community UDFs. (#410)

GoogleCloudPlatform · May 15, 2024 · 733cc4b · 733cc4b
1 parent 882406e
commit 733cc4b
Show file tree

Hide file tree

Showing 4 changed files with 238 additions and 0 deletions.
diff --git a/udfs/community/README.md b/udfs/community/README.md
@@ -40,6 +40,7 @@ SELECT bqutil.fn.int(1.684)
 * [cw_comparable_format_varchar_t](#cw_comparable_format_varchar_tpart-string)
 * [cw_convert_base](#cw_convert_basenumber-string-from_base-int64-to_base-int64)
 * [cw_csvld](#cw_csvldtext-string-comma-string-quote-string-len-int64)
+* [cw_disjoint_partition_by_regexp](#cw_disjoint_partition_by_regexpfirstrn-int64-haystack-string-regex-string)
 * [cw_editdistance](#cw_editdistancea-string-b-string)
 * [cw_error_number](#cw_error_numbererrmsg-string)
 * [cw_error_severity](#cw_error_severityerrmsg-string)
@@ -65,6 +66,7 @@ SELECT bqutil.fn.int(1.684)
 * [cw_nvp2json3](#cw_nvp2json3nvp-string-name_delim-string-val_delim-string)
 * [cw_nvp2json4](#cw_nvp2json4nvp-string-name_delim-string-val_delim-string-ignore_char-string)
 * [cw_otranslate](#cw_otranslates-string-key-string-value-string)
+* [cw_overlapping_partition_by_regexp](#cw_overlapping_partition_by_regexpfirstrn-int64-haystack-string-regex-string)
 * [cw_period_intersection](#cw_period_intersectionp1-structlower-timestamp-upper-timestamp-p2-structlower-timestamp-upper-timestamp)
 * [cw_period_ldiff](#cw_period_ldiffp1-structlower-timestamp-upper-timestamp-p2-structlower-timestamp-upper-timestamp)
 * [cw_period_rdiff](#cw_period_rdiffp1-structlower-timestamp-upper-timestamp-p2-structlower-timestamp-upper-timestamp)
@@ -445,6 +447,22 @@ SELECT bqutil.fn.cw_csvld('Test#123', '#', '"', 2);
 ["Test", "123"]
 ```
 
+### [cw_disjoint_partition_by_regexp(firstRn INT64, haystack STRING, regex STRING)](cw_disjoint_partition_by_regexp.sqlx)
+Partitions rows into disjoint segments by matching their sequence with the provided regex pattern.
+```sql
+SELECT bqutil.fn.cw_disjoint_partition_by_regexp(1, 'A@1#A@2#B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_disjoint_partition_by_regexp(2, 'A@1#A@2#B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_disjoint_partition_by_regexp(3, 'A@1#A@2#B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_disjoint_partition_by_regexp(4, 'A@1#A@2#B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_disjoint_partition_by_regexp(5, 'A@1#A@2#B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+
+[1, 2, 3]
+[]
+[]
+[4, 5]
+[]
+```
+
 ### [cw_editdistance(a STRING, b STRING)](cw_editdistance.sqlx)
 Similar to teradata's editdistance without weightages
 ```sql
@@ -668,6 +686,23 @@ SELECT bqutil.fn.cw_otranslate('Thin and Thick', 'Thk', 'Sp');
 Spin and Spic
 ```
 
+### [cw_overlapping_partition_by_regexp(firstRn INT64, haystack STRING, regex STRING)](cw_overlapping_partition_by_regexp.sqlx)
+Partitions rows into overlapping segments by matching their sequence with the provided regex pattern.
+```sql
+SELECT bqutil.fn.cw_overlapping_partition_by_regexp(1, 'A@1#A@2#B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_overlapping_partition_by_regexp(2, 'A@2#B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_overlapping_partition_by_regexp(3, 'B@3#A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_overlapping_partition_by_regexp(4, 'A@4#B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+SELECT bqutil.fn.cw_overlapping_partition_by_regexp(5, 'B@5#', '(?:A@\\d+#)+(?:B@\\d+#)')
+
+[1, 2, 3]
+[2, 3]
+[]
+[4, 5]
+[]
+```
+
+
 ### [cw_period_intersection(p1 STRUCT<lower TIMESTAMP, upper TIMESTAMP>, p2 STRUCT<lower TIMESTAMP, upper TIMESTAMP>)](cw_period_intersection.sqlx)
 ```sql
 SELECT bqutil.fn.cw_period_intersection(

diff --git a/udfs/community/cw_disjoint_partition_by_regexp.sqlx b/udfs/community/cw_disjoint_partition_by_regexp.sqlx
@@ -0,0 +1,52 @@
+config { hasOutput: true }
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CREATE OR REPLACE FUNCTION ${self()}(firstRn INT64, haystack STRING, regex STRING)
+  RETURNS ARRAY<INT64>
+  OPTIONS(description="""Returns disjoint matches for the regex with the sequence of rows encoded in
+ custom format. The expected input haystack format is
+ "row-1@row-number-1#row-2@row-number-2# ... #row-n@row-number-n#".
+ If the regex matches then, it returns the rows matched by the given pattern
+ if the firstRn is the first row in the given matched subsequence. e.g. if
+ firstRn is 4,  haystack is "A@1#A@2#B@3#A@4#B@5#" and regex is
+ "(?:A@\\d+#)+(?:B@\\d+#)" then it will return [4, 5] since matched subsequence
+ is "A@4#B@5#" starts at firstRn 4, however if you have input firstRn is 5 with
+ the same input then it will return an empty array since row number 5 is part
+ of "A@4#B@5#" but it is not the first row in the subsequence.
+
+ By repeatedly calling cw_disjoint_partition_by_regexp with increasing values of
+ firstRn and keeping haystack constant, this UDF effectively returns disjoint
+ sets of row numbers that match the given regex.
+
+ Continuing the above example, if we call this UDF with firstRn having
+ increasing values, then we get corresponding outputs as follows:
+ firstRn       haystack                output
+ 1             "A@1#A@2#B@3#A@4#B@5#"  [1, 2, 3]
+ 2             "A@1#A@2#B@3#A@4#B@5#"  []
+ 3             "A@1#A@2#B@3#A@4#B@5#"  []
+ 4             "A@1#A@2#B@3#A@4#B@5#"  [4, 5]
+ 5             "A@1#A@2#B@3#A@4#B@5#"  []
+""")
+  AS (
+    (WITH first_row_match AS (
+        -- Among the non-overlapping regex matches, keep the one whose first
+        -- encoded row number equals firstRn and count the rows it spans.
+        -- n_rows is NULL when no match starts at firstRn.
+        SELECT
+          MIN(ARRAY_LENGTH(REGEXP_EXTRACT_ALL(m, '@\\d+#'))) AS n_rows
+        FROM UNNEST(REGEXP_EXTRACT_ALL(haystack, regex)) m
+        WHERE REGEXP_EXTRACT(m, '@(\\d+)#') = CAST(firstRn AS STRING)
+      )
+      SELECT
+        CASE
+          -- NULL inputs keep yielding NULL.
+          WHEN haystack IS NULL OR regex IS NULL THEN NULL
+          -- Treating a missing match as zero rows makes the range end fall
+          -- before its start, so GENERATE_ARRAY produces the documented empty
+          -- array instead of a NULL array.
+          ELSE GENERATE_ARRAY(firstRn, firstRn + COALESCE(n_rows, 0) - 1)
+        END
+      FROM first_row_match)
+  );
diff --git a/udfs/community/cw_overlapping_partition_by_regexp.sqlx b/udfs/community/cw_overlapping_partition_by_regexp.sqlx
@@ -0,0 +1,49 @@
+config { hasOutput: true }
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CREATE OR REPLACE FUNCTION ${self()}(firstRn INT64, haystack STRING, regex STRING)
+  RETURNS ARRAY<INT64>
+  OPTIONS(description="""Returns overlapping matches for the regex with the sequence of rows encoded in
+ custom format. The expected input haystack format is
+ "row-1@row-number-1#row-2@row-number-2# ... #row-n@row-number-n#".
+ If the regex matches, then it returns all rows matched at the prefix of the
+ haystack,  e.g. if firstRn is 1, haystack is "A@1#A@2#B@3#A@4#B@5#" and regex
+ is "(?:A@\\d+#)+(?:B@\\d+#)" then it will return [1, 2, 3] as output since
+ matched subsequence is "A@1#A@2#B@3#". If haystack is "B@3#A@4#B@5#" for the
+ same regex it will return an empty array since regex does not match at the
+ prefix of the haystack.
+
+ By repeatedly calling cw_overlapping_partition_by_regexp with increasing values
+ of firstRn and a substring of sequence starting at firstRn, this UDF
+ effectively returns overlapping sets of row numbers that match the given regex.
+
+ Continuing the above example, if we call this UDF with firstRn having
+ increasing values, and haystack having substring starting with firstRn, then we
+ get corresponding outputs as follows:
+ firstRn    haystack                output
+ 1          "A@1#A@2#B@3#A@4#B@5#"  [1, 2, 3]
+ 2          "A@2#B@3#A@4#B@5#"      [2, 3]
+ 3          "B@3#A@4#B@5#"          []
+ 4          "A@4#B@5#"              [4, 5]
+ 5          "B@5#"                  []
+""")
+  AS (
+   (WITH prefix_match AS (
+      -- Anchor the pattern at the start of haystack and count the encoded rows
+      -- in the matched prefix. n_rows is NULL when the pattern does not match
+      -- at the prefix.
+      SELECT
+        ARRAY_LENGTH(
+          REGEXP_EXTRACT_ALL(
+            REGEXP_EXTRACT(haystack, '^(' || regex || ')'),
+            '@\\d+#')) AS n_rows
+    )
+    SELECT
+      CASE
+        -- NULL inputs keep yielding NULL.
+        WHEN haystack IS NULL OR regex IS NULL THEN NULL
+        -- Treating a missing match as zero rows makes the range end fall
+        -- before its start, so GENERATE_ARRAY produces the documented empty
+        -- array instead of a NULL array.
+        ELSE GENERATE_ARRAY(firstRn, firstRn + COALESCE(n_rows, 0) - 1)
+      END
+    FROM prefix_match)
+  );
diff --git a/udfs/community/test_cases.js b/udfs/community/test_cases.js
@@ -3839,3 +3839,105 @@ generate_udf_test("table_url", [
     }
 
   ]);
+
+generate_udf_test("cw_overlapping_partition_by_regexp", [
+    {
+      inputs: [
+        `1`,
+        `"A@1#A@2#B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([1, 2, 3] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `2`,
+        `"A@2#B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([ 2, 3] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `3`,
+        `"B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `4`,
+        `"A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([4, 5] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `5`,
+        `"B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `101`,
+        `"A@101#A@102#B@103#A@104#B@105#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([101, 102, 103] AS ARRAY<INT64>)`
+    }
+  ]);
+
+generate_udf_test("cw_disjoint_partition_by_regexp", [
+    {
+      inputs: [
+        `1`,
+        `"A@1#A@2#B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([1, 2, 3] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `2`,
+        `"A@1#A@2#B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `3`,
+        `"A@1#A@2#B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `4`,
+        `"A@1#A@2#B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([4, 5] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `5`,
+        `"A@1#A@2#B@3#A@4#B@5#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([] AS ARRAY<INT64>)`
+    },
+    {
+      inputs: [
+        `104`,
+        `"A@101#A@102#B@103#A@104#B@105#"`,
+        `"(?:A@\\\\d+#)+(?:B@\\\\d+#)"`
+      ],
+      expected_output: `CAST([104, 105] AS ARRAY<INT64>)`
+    }
+  ]);