Skip to content

Commit af01c25

Browse files
feodordmpgpro
authored andcommitted
Make exact distance match for FTS phrase operator
Phrase operator now requires exact distance betweens lexems instead of less-or-equal. Per discussion c19fcfec308e6ccd952cdde9e648b505@mail.gmail.com Conflicts: doc/src/sgml/textsearch.sgml src/backend/utils/adt/tsvector_op.c
1 parent f4e9a02 commit af01c25

File tree

4 files changed

+122
-52
lines changed

4 files changed

+122
-52
lines changed

doc/src/sgml/textsearch.sgml

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,57 @@ text @@ text
344344
The form <type>text</type> <literal>@@</literal> <type>text</type>
345345
is equivalent to <literal>to_tsvector(x) @@ plainto_tsquery(y)</literal>.
346346
</para>
347+
348+
<para>
349+
Within a <type>tsquery</>, the <literal>&amp;</literal> (AND) operator
350+
specifies that both its arguments must appear in the document to have a
351+
match. Similarly, the <literal>|</literal> (OR) operator specifies that
352+
at least one of its arguments must appear, while the <literal>!</> (NOT)
353+
operator specifies that its argument must <emphasis>not</> appear in
354+
order to have a match. Parentheses can be used to control nesting of
355+
these operators.
356+
</para>
357+
358+
<para>
359+
Searching for phrases is possible with the help of
360+
the <literal>&lt;-&gt;</> (FOLLOWED BY) <type>tsquery</> operator, which
361+
matches only if its arguments have matches that are adjacent and in the
362+
given order. For example:
363+
364+
<programlisting>
365+
SELECT to_tsvector('fatal error') @@ to_tsquery('fatal &lt;-&gt; error');
366+
?column?
367+
----------
368+
t
369+
370+
SELECT to_tsvector('error is not fatal') @@ to_tsquery('fatal &lt;-&gt; error');
371+
?column?
372+
----------
373+
f
374+
</programlisting>
375+
376+
There is a more general version of the FOLLOWED BY operator having the
377+
form <literal>&lt;<replaceable>N</>&gt;</literal>,
378+
where <replaceable>N</> is an integer standing for the exact distance
379+
allowed between the matching lexemes. <literal>&lt;1&gt;</literal> is
380+
the same as <literal>&lt;-&gt;</>, while <literal>&lt;2&gt;</literal>
381+
allows one other lexeme to appear between the matches, and so
382+
on. The <literal>phraseto_tsquery</> function makes use of this
383+
operator to construct a <literal>tsquery</> that can match a multi-word
384+
phrase when some of the words are stop words. For example:
385+
386+
<programlisting>
387+
SELECT phraseto_tsquery('cats ate rats');
388+
phraseto_tsquery
389+
-------------------------------
390+
( 'cat' &lt;-&gt; 'ate' ) &lt;-&gt; 'rat'
391+
392+
SELECT phraseto_tsquery('the cats ate the rats');
393+
phraseto_tsquery
394+
-------------------------------
395+
( 'cat' &lt;-&gt; 'ate' ) &lt;2&gt; 'rat'
396+
</programlisting>
397+
</para>
347398
</sect2>
348399

349400
<sect2 id="textsearch-intro-configurations">
@@ -1501,8 +1552,11 @@ SELECT to_tsquery('fat') &lt;-&gt; to_tsquery('cat | rat');
15011552

15021553
<listitem>
15031554
<para>
1504-
Returns the distanced phrase-concatenation of the two given queries.
1505-
This function lies in the implementation of the <literal>&lt;-&gt;</> operator.
1555+
Returns a query that searches for a match to the first given query
1556+
followed by a match to the second given query at a distance of at
1557+
<replaceable>distance</replaceable> lexemes, using
1558+
the <literal>&lt;<replaceable>N</>&gt;</literal>
1559+
<type>tsquery</> operator. For example:
15061560

15071561
<screen>
15081562
SELECT tsquery_phrase(to_tsquery('fat'), to_tsquery('cat'), 10);

src/backend/utils/adt/tsvector_op.c

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -788,11 +788,12 @@ TS_phrase_execute(QueryItem *curitem,
788788
}
789789
else
790790
{
791-
ExecPhraseData Ldata = {0, false, NULL},
792-
Rdata = {0, false, NULL};
793-
WordEntryPos *Lpos,
794-
*Rpos,
795-
*pos_iter = NULL;
791+
ExecPhraseData Ldata = {0, false, NULL},
792+
Rdata = {0, false, NULL};
793+
WordEntryPos *Lpos,
794+
*LposStart,
795+
*Rpos,
796+
*pos_iter = NULL;
796797

797798
Assert(curitem->qoperator.oper == OP_PHRASE);
798799

@@ -830,52 +831,60 @@ TS_phrase_execute(QueryItem *curitem,
830831
pos_iter = data->pos;
831832
}
832833

833-
Lpos = Ldata.pos;
834-
Rpos = Rdata.pos;
835-
836834
/*
837835
* Find matches by distance, WEP_GETPOS() is needed because
838836
* ExecPhraseData->data can point to the tsvector's WordEntryPosVector
839837
*/
840838

839+
Rpos = Rdata.pos;
840+
LposStart = Ldata.pos;
841841
while (Rpos < Rdata.pos + Rdata.npos)
842842
{
843+
/*
844+
* We need to check all possible distances, so reset Lpos
845+
* to guranteed not yet satisfied position.
846+
*/
847+
Lpos = LposStart;
843848
while (Lpos < Ldata.pos + Ldata.npos)
844849
{
845-
if (WEP_GETPOS(*Lpos) <= WEP_GETPOS(*Rpos))
850+
if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) ==
851+
curitem->qoperator.distance)
846852
{
847-
/*
848-
* Lpos is behind the Rpos, so we have to check the
849-
* distance condition
850-
*/
851-
if (WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <= curitem->qoperator.distance)
853+
/* MATCH! */
854+
if (data)
852855
{
853-
/* MATCH! */
854-
if (data)
855-
{
856-
*pos_iter = WEP_GETPOS(*Rpos);
857-
pos_iter++;
858-
859-
break; /* We need to build a unique result
860-
* array, so go to the next Rpos */
861-
}
862-
else
863-
{
864-
/*
865-
* We are in the root of the phrase tree and hence
866-
* we don't have to store the resulting positions
867-
*/
868-
return true;
869-
}
856+
/* Store position for upper phrase operator */
857+
*pos_iter = WEP_GETPOS(*Rpos);
858+
pos_iter++;
859+
860+
/*
861+
* Set left start position to next, because current one
862+
* could not satisfy distance for any other right
863+
* position
864+
*/
865+
LposStart = Lpos + 1;
866+
break;
870867
}
868+
else
869+
{
870+
/*
871+
* We are in the root of the phrase tree and hence
872+
* we don't have to store the resulting positions
873+
*/
874+
return true;
875+
}
876+
871877
}
872-
else
878+
else if (WEP_GETPOS(*Rpos) <= WEP_GETPOS(*Lpos) ||
879+
WEP_GETPOS(*Rpos) - WEP_GETPOS(*Lpos) <
880+
curitem->qoperator.distance)
873881
{
874882
/*
875-
* Go to the next Rpos, because Lpos
876-
* is ahead of the current Rpos
883+
* Go to the next Rpos, because Lpos is ahead or on less
884+
* distance than required by current operator
877885
*/
878886
break;
887+
879888
}
880889

881890
Lpos++;

src/test/regress/expected/tstypes.out

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -683,10 +683,10 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true";
683683
t
684684
(1 row)
685685

686-
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true";
687-
true
688-
------
689-
t
686+
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false";
687+
false
688+
-------
689+
f
690690
(1 row)
691691

692692
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false";
@@ -701,6 +701,12 @@ SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true";
701701
t
702702
(1 row)
703703

704+
SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true";
705+
true
706+
------
707+
t
708+
(1 row)
709+
704710
SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false";
705711
false
706712
-------
@@ -915,7 +921,7 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:*');
915921
SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:A');
916922
ts_rank_cd
917923
------------
918-
0.0714286
924+
0
919925
(1 row)
920926

921927
SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B');
@@ -942,10 +948,10 @@ SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true";
942948
t
943949
(1 row)
944950

945-
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true";
946-
true
947-
------
948-
t
951+
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "false";
952+
false
953+
-------
954+
f
949955
(1 row)
950956

951957
SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false";
@@ -972,9 +978,9 @@ SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true";
972978
t
973979
(1 row)
974980

975-
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true";
976-
true
977-
------
978-
t
981+
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
982+
false
983+
-------
984+
f
979985
(1 row)
980986

src/test/regress/sql/tstypes.sql

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,10 @@ SELECT 'supeznova supernova'::tsvector @@ 'super:*'::tsquery AS "true";
133133

134134
--phrase search
135135
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 2' AS "true";
136-
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "true";
136+
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 2' AS "false";
137137
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <-> 3' AS "false";
138138
SELECT to_tsvector('simple', '1 2 3 1') @@ '1 <2> 3' AS "true";
139+
SELECT to_tsvector('simple', '1 2 1 2') @@ '1 <3> 2' AS "true";
139140

140141
SELECT to_tsvector('simple', '1 2 11 3') @@ '1 <-> 3' AS "false";
141142
SELECT to_tsvector('simple', '1 2 11 3') @@ '1:* <-> 3' AS "true";
@@ -183,10 +184,10 @@ SELECT ts_rank_cd(' a:1 sa:2A sb:2D g'::tsvector, 'a <-> s:* <-> sa:B');
183184
SELECT 'a:1 b:2'::tsvector @@ 'a <-> b'::tsquery AS "true";
184185
SELECT 'a:1 b:2'::tsvector @@ 'a <0> b'::tsquery AS "false";
185186
SELECT 'a:1 b:2'::tsvector @@ 'a <1> b'::tsquery AS "true";
186-
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "true";
187+
SELECT 'a:1 b:2'::tsvector @@ 'a <2> b'::tsquery AS "false";
187188
SELECT 'a:1 b:3'::tsvector @@ 'a <-> b'::tsquery AS "false";
188189
SELECT 'a:1 b:3'::tsvector @@ 'a <0> b'::tsquery AS "false";
189190
SELECT 'a:1 b:3'::tsvector @@ 'a <1> b'::tsquery AS "false";
190191
SELECT 'a:1 b:3'::tsvector @@ 'a <2> b'::tsquery AS "true";
191-
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "true";
192+
SELECT 'a:1 b:3'::tsvector @@ 'a <3> b'::tsquery AS "false";
192193

0 commit comments

Comments
 (0)