Skip to content

Commit 9481d16

Browse files
committed
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' but it should not. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not apply applied while in a character class area, with specific cases added for all the characters converted outside character classes like an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe <laurenz.albe@cybertec.at> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13
1 parent 4f49456 commit 9481d16

File tree

3 files changed

+114
-6
lines changed

3 files changed

+114
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -673,8 +673,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
673673
int plen,
674674
elen;
675675
bool afterescape = false;
676-
bool incharclass = false;
677676
int nquotes = 0;
677+
int charclass_depth = 0; /* Nesting level of character classes,
678+
* encompassed by square brackets */
679+
int charclass_start = 0; /* State of the character class start,
680+
* for carets */
678681

679682
p = VARDATA_ANY(pat_text);
680683
plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -804,7 +807,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
804807
/* fast path */
805808
if (afterescape)
806809
{
807-
if (pchar == '"' && !incharclass) /* escape-double-quote? */
810+
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
808811
{
809812
/* emit appropriate part separator, per notes above */
810813
if (nquotes == 0)
@@ -853,18 +856,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
853856
/* SQL escape character; do not send to output */
854857
afterescape = true;
855858
}
856-
else if (incharclass)
859+
else if (charclass_depth > 0)
857860
{
858861
if (pchar == '\\')
859862
*r++ = '\\';
860863
*r++ = pchar;
861-
if (pchar == ']')
862-
incharclass = false;
864+
865+
/*
866+
* Ignore a closing bracket at the start of a character class.
867+
* Such a bracket is taken literally rather than closing the
868+
* class. "charclass_start" is 1 right at the beginning of a
869+
* class and 2 after an initial caret.
870+
*/
871+
if (pchar == ']' && charclass_start > 2)
872+
charclass_depth--;
873+
else if (pchar == '[')
874+
charclass_depth++;
875+
876+
/*
877+
* If there is a caret right after the opening bracket, it negates
878+
* the character class, but a following closing bracket should
879+
* still be treated as a normal character. That holds only for
880+
* the first caret, so only the values 1 and 2 mean that closing
881+
* brackets should be taken literally.
882+
*/
883+
if (pchar == '^')
884+
charclass_start++;
885+
else
886+
charclass_start = 3; /* definitely past the start */
863887
}
864888
else if (pchar == '[')
865889
{
890+
/* start of a character class */
866891
*r++ = pchar;
867-
incharclass = true;
892+
charclass_depth++;
893+
charclass_start = 1;
868894
}
869895
else if (pchar == '%')
870896
{

src/test/regress/expected/strings.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
557557
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
558558
ERROR: invalid escape string
559559
HINT: Escape string must be empty or one character.
560+
-- Characters that should be left alone in character classes when a
561+
-- SIMILAR TO regexp pattern is converted to POSIX style.
562+
-- Underscore "_"
563+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
564+
QUERY PLAN
565+
------------------------------------------------
566+
Seq Scan on text_tbl
567+
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
568+
(2 rows)
569+
570+
-- Percentage "%"
571+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
572+
QUERY PLAN
573+
--------------------------------------------------
574+
Seq Scan on text_tbl
575+
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
576+
(2 rows)
577+
578+
-- Dot "."
579+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
580+
QUERY PLAN
581+
--------------------------------------------------
582+
Seq Scan on text_tbl
583+
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
584+
(2 rows)
585+
586+
-- Dollar "$"
587+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
588+
QUERY PLAN
589+
--------------------------------------------------
590+
Seq Scan on text_tbl
591+
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
592+
(2 rows)
593+
594+
-- Opening parenthesis "("
595+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
596+
ERROR: invalid regular expression: parentheses () not balanced
597+
-- Caret "^"
598+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
599+
QUERY PLAN
600+
------------------------------------------------------------------------
601+
Seq Scan on text_tbl
602+
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
603+
(2 rows)
604+
605+
-- Closing square bracket "]" at the beginning of character class
606+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
607+
QUERY PLAN
608+
------------------------------------------------
609+
Seq Scan on text_tbl
610+
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
611+
(2 rows)
612+
613+
-- Closing square bracket effective after two carets at the beginning
614+
-- of character class.
615+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
616+
QUERY PLAN
617+
---------------------------------------
618+
Seq Scan on text_tbl
619+
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
620+
(2 rows)
621+
560622
-- Test back reference in regexp_replace
561623
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
562624
regexp_replace

src/test/regress/sql/strings.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
183183
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
184184
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
185185

186+
-- Characters that should be left alone in character classes when a
187+
-- SIMILAR TO regexp pattern is converted to POSIX style.
188+
-- Underscore "_"
189+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
190+
-- Percentage "%"
191+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
192+
-- Dot "."
193+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
194+
-- Dollar "$"
195+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
196+
-- Opening parenthesis "("
197+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
198+
-- Caret "^"
199+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
200+
-- Closing square bracket "]" at the beginning of character class
201+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
202+
-- Closing square bracket effective after two carets at the beginning
203+
-- of character class.
204+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
205+
186206
-- Test back reference in regexp_replace
187207
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
188208
SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g');

0 commit comments

Comments
 (0)