diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index b93105e9c7..37b9337635 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -61,3 +61,14 @@ SELECT ts_lexize('unaccent', ' {????} (1 row) +CREATE TEXT SEARCH CONFIGURATION unaccent( + COPY=russian +); +ALTER TEXT SEARCH CONFIGURATION unaccent ALTER MAPPING FOR + asciiword, word WITH unaccent MAP russian_stem; +SELECT to_tsvector('unaccent', 'foobar ????? ????'); + to_tsvector +------------------------------ + 'foobar':1 '?????':2 '???':3 +(1 row) + diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index 310213994f..6ce21cdfcd 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -2,7 +2,6 @@ CREATE EXTENSION unaccent; -- must have a UTF8 database SELECT getdatabaseencoding(); - SET client_encoding TO 'KOI8'; SELECT unaccent('foobar'); @@ -16,3 +15,12 @@ SELECT unaccent('unaccent', ' SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', '????'); SELECT ts_lexize('unaccent', '????'); + +CREATE TEXT SEARCH CONFIGURATION unaccent( + COPY=russian +); + +ALTER TEXT SEARCH CONFIGURATION unaccent ALTER MAPPING FOR + asciiword, word WITH unaccent MAP russian_stem; + +SELECT to_tsvector('unaccent', 'foobar ????? ????'); diff --git a/doc/src/sgml/ref/alter_tsconfig.sgml b/doc/src/sgml/ref/alter_tsconfig.sgml index ebe0b94b27..ecc37044a9 100644 --- a/doc/src/sgml/ref/alter_tsconfig.sgml +++ b/doc/src/sgml/ref/alter_tsconfig.sgml @@ -21,8 +21,12 @@ PostgreSQL documentation +ALTER TEXT SEARCH CONFIGURATION name + ADD MAPPING FOR token_type [, ... ] WITH config ALTER TEXT SEARCH CONFIGURATION name ADD MAPPING FOR token_type [, ... ] WITH dictionary_name [, ... ] +ALTER TEXT SEARCH CONFIGURATION name + ALTER MAPPING FOR token_type [, ... ] WITH config ALTER TEXT SEARCH CONFIGURATION name ALTER MAPPING FOR token_type [, ... ] WITH dictionary_name [, ... ] ALTER TEXT SEARCH CONFIGURATION name @@ -88,6 +92,17 @@ ALTER TEXT SEARCH CONFIGURATION name SET SCHEMA + + config + + + The dictionary tree expression. The expression + is a condition/command/else triple that defines the way tokens are + processed. The ELSE part is optional. + + + + old_dictionary @@ -133,7 +148,7 @@ ALTER TEXT SEARCH CONFIGURATION name SET SCHEMA - + The ADD MAPPING FOR form installs a list of dictionaries to be @@ -154,6 +169,53 @@ ALTER TEXT SEARCH CONFIGURATION name SET SCHEMA + + Dictionaries Map Configuration + + + Format + + Formally config is one of: + + + * dictionary_name + + * config { UNION | INTERSECT | EXCEPT | MAP } config + + * CASE config + WHEN [ NO ] MATCH THEN { KEEP | config } + [ ELSE config ] + END + + + + + Description + + config can be written + in three different formats. The simplest format is the name of a dictionary to + use for token processing. + + + In order to use more than one dictionary + simultaneously, the dictionaries should be interconnected by operators. The operators + UNION, EXCEPT and + INTERSECT have the same meaning as in operations on sets. + The special operator MAP takes the output of the left subexpression + and uses it as the input to the right subexpression. + + + The third format of config is similar to a + CASE/WHEN/THEN/ELSE structure. It consists of three + replaceable parts. The first one is a configuration that is used to construct the set of lexemes + for matching the condition. If the condition is triggered, the command is executed.
+ Use the KEEP command to avoid repeating the same + configuration in the condition and command parts. However, the command may differ from + the condition. If the condition is not triggered, the ELSE branch is executed. + + + + + Examples @@ -167,6 +229,34 @@ ALTER TEXT SEARCH CONFIGURATION name SET SCHEMA + + + The next example shows how to analyze documents in both English and German. + english_hunspell and german_hunspell + return a result only if a word is recognized. Otherwise, the stemmer dictionaries + are used to process the token. + + + +ALTER TEXT SEARCH CONFIGURATION my_config + ALTER MAPPING FOR asciiword, word WITH + CASE english_hunspell WHEN MATCH THEN KEEP ELSE english_stem END + UNION + CASE german_hunspell WHEN MATCH THEN KEEP ELSE german_stem END; + + + + In order to combine searches for both exact and processed forms, the vector + should contain lexemes produced by simple for the exact form + of the word as well as lexemes produced by a linguistic-aware dictionary + (e.g. english_stem) for processed forms. + + + +ALTER TEXT SEARCH CONFIGURATION my_config + ALTER MAPPING FOR asciiword, word WITH english_stem UNION simple; + + diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 610b7bf033..1253b41f53 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -732,10 +732,11 @@ SELECT to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'); The to_tsvector function internally calls a parser which breaks the document text into tokens and assigns a type to each token. For each token, a list of - dictionaries () is consulted, - where the list can vary depending on the token type. The first dictionary - that recognizes the token emits one or more normalized - lexemes to represent the token. For example, + condition/command pairs is consulted, where the list can vary depending + on the token type; conditions and commands are expressions over dictionaries, + with the matching clause in the condition (). + The first command whose condition evaluates to true emits one or more normalized + lexemes to represent the token. For example, rats became rat because one of the dictionaries recognized that the word rats is a plural form of rat. Some words are recognized as @@ -743,7 +744,7 @@ SELECT to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'); causes them to be ignored since they occur too frequently to be useful in searching. In our example these are a, on, and it. - If no dictionary in the list recognizes the token then it is also ignored. + If none of the conditions is true, the token is ignored. In this example that happened to the punctuation sign - because there are in fact no dictionaries assigned for its token type (Space symbols), meaning space tokens will never be @@ -2232,8 +2233,8 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h a single lexeme with the TSL_FILTER flag set, to replace the original token with a new token to be passed to subsequent - dictionaries (a dictionary that does this is called a - filtering dictionary) + dictionaries in the comma-separated syntax (a dictionary that does this + is called a filtering dictionary) @@ -2265,38 +2266,126 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h type that the parser can return, a separate list of dictionaries is specified by the configuration. When a token of that type is found by the parser, each dictionary in the list is consulted in turn, - until some dictionary recognizes it as a known word.
If it is identified - as a stop word, or if no dictionary recognizes the token, it will be - discarded and not indexed or searched for. - Normally, the first dictionary that returns a non-NULL - output determines the result, and any remaining dictionaries are not - consulted; but a filtering dictionary can replace the given word - with a modified word, which is then passed to subsequent dictionaries. + until a command is selected based on its condition. If no case is + selected, the token will be discarded and not indexed or searched for. - The general rule for configuring a list of dictionaries - is to place first the most narrow, most specific dictionary, then the more - general dictionaries, finishing with a very general dictionary, like + A tree of cases is described as condition/command/else triples. Each + condition is evaluated in order to select the appropriate command to generate + the resulting set of lexemes. + + + + A condition is an expression with dictionaries as operands, the + basic set operators UNION, EXCEPT, INTERSECT, + and the special operator MAP. + The special operator MAP uses the output of the left subexpression as + the input for the right subexpression. + + + + The rules for writing a command are the same as for a condition, with the additional keyword + KEEP, which reuses the result of the condition as the output. + + + + A comma-separated list of dictionaries is a simplified variant of a text + search configuration. Each dictionary is consulted in turn to process a token, and the first + non-NULL output is accepted as the processing result. + + + + The general rule for configuring token processing + is to place first the case with the narrowest, most specific dictionary, then the more + general dictionaries, finishing with a very general dictionary, like a Snowball stemmer or simple, which - recognizes everything.
For example, for an astronomy-specific search (astro_en configuration) one could bind token type asciiword (ASCII word) to a synonym dictionary of astronomical terms, a general English dictionary and a Snowball English - stemmer: + stemmer, in the comma-separated variant of mapping: + ALTER TEXT SEARCH CONFIGURATION astro_en ADD MAPPING FOR asciiword WITH astrosyn, english_ispell, english_stem; + + + Another example is a configuration for both English and German, using the + operator-separated variant of mapping: + + + +ALTER TEXT SEARCH CONFIGURATION multi_en_de + ADD MAPPING FOR asciiword, word WITH + CASE english_hunspell WHEN MATCH THEN KEEP ELSE english_stem END + UNION + CASE german_hunspell WHEN MATCH THEN KEEP ELSE german_stem END; + + + + This configuration provides the ability to search a collection of multilingual + documents without specifying the language: + + + +WITH docs(id, txt) as (values (1, 'Das geschah zu Beginn dieses Monats'), + (2, 'with old stars and lacking gas and dust'), + (3, '25 light-years across, blown by winds from its central')) +SELECT * FROM docs WHERE to_tsvector('multi_en_de', txt) @@ to_tsquery('multi_en_de', 'lack'); + id | txt +----+----------------------------------------- + 2 | with old stars and lacking gas and dust + +WITH docs(id, txt) as (values (1, 'Das geschah zu Beginn dieses Monats'), + (2, 'with old stars and lacking gas and dust'), + (3, '25 light-years across, blown by winds from its central')) +SELECT * FROM docs WHERE to_tsvector('multi_en_de', txt) @@ to_tsquery('multi_en_de', 'beginnen'); + id | txt +----+------------------------------------- + 1 | Das geschah zu Beginn dieses Monats + + + + A combination of a stemmer dictionary with the simple one may be used to mix + exact-form search for one word with linguistic search for others. + + + +ALTER TEXT SEARCH CONFIGURATION exact_and_linguistic + ADD MAPPING FOR asciiword, word WITH english_stem UNION simple; + + + + In the following example, the simple dictionary is used to prevent normalization of words in the query. + +WITH docs(id, txt) as (values (1, 'Supernova star'), + (2, 'Supernova stars')) +SELECT * FROM docs WHERE to_tsvector('exact_and_linguistic', txt) @@ (to_tsquery('simple', 'stars') && to_tsquery('english', 'supernovae')); + id | txt +----+----------------- + 2 | Supernova stars + + + + + Because there is no information about the origin of each lexeme in the tsvector, this may + lead to false positives when a stemmed form coincides with an exact form used in a query. + + + - A filtering dictionary can be placed anywhere in the list, except at the - end where it'd be useless. Filtering dictionaries are useful to partially + Filtering dictionaries are useful to partially normalize words to simplify the task of later dictionaries. For example, a filtering dictionary could be used to remove accents from accented letters, as is done by the module. + A filtering dictionary should be placed to the left of the MAP + operator. If the filtering dictionary returns NULL, it passes the initial token + to the right subexpression.
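A minimal sketch of this MAP-plus-filtering-dictionary usage, mirroring the contrib/unaccent test at the top of this patch (the unaccent extension is assumed to be installed; the configuration name my_unaccent is illustrative):

CREATE TEXT SEARCH CONFIGURATION my_unaccent ( COPY = english );

ALTER TEXT SEARCH CONFIGURATION my_unaccent
    ALTER MAPPING FOR asciiword, word WITH unaccent MAP english_stem;

-- unaccent strips the accents and, through MAP, feeds its output to
-- english_stem; a token such as 'Hôtels' should yield the lexeme 'hotel'.
-- If unaccent returns NULL, the initial token is passed on to english_stem.
SELECT to_tsvector('my_unaccent', 'Hôtels');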
@@ -2463,9 +2552,9 @@ SELECT ts_lexize('public.simple_dict','The'); SELECT * FROM ts_debug('english', 'Paris'); - alias | description | token | dictionaries | dictionary | lexemes ------------+-----------------+-------+----------------+--------------+--------- - asciiword | Word, all ASCII | Paris | {english_stem} | english_stem | {pari} + alias | description | token | dictionaries | configuration | command | lexemes +-----------+-----------------+-------+----------------+---------------+--------------+--------- + asciiword | Word, all ASCII | Paris | {english_stem} | english_stem | english_stem | {pari} CREATE TEXT SEARCH DICTIONARY my_synonym ( TEMPLATE = synonym, @@ -2477,9 +2566,12 @@ ALTER TEXT SEARCH CONFIGURATION english WITH my_synonym, english_stem; SELECT * FROM ts_debug('english', 'Paris'); - alias | description | token | dictionaries | dictionary | lexemes ------------+-----------------+-------+---------------------------+------------+--------- - asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris} + alias | description | token | dictionaries | configuration | command | lexemes +-----------+-----------------+-------+---------------------------+---------------------------------------------+------------+--------- + asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | CASE my_synonym WHEN MATCH THEN KEEP +| my_synonym | {paris} + | | | | ELSE CASE english_stem WHEN MATCH THEN KEEP+| | + | | | | END +| | + | | | | END | | @@ -3104,6 +3196,21 @@ CREATE TEXT SEARCH DICTIONARY english_ispell ( Now we can set up the mappings for words in configuration pg: + +ALTER TEXT SEARCH CONFIGURATION pg + ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, + word, hword, hword_part + WITH + CASE pg_dict WHEN MATCH THEN KEEP + ELSE + CASE english_ispell WHEN MATCH THEN KEEP + ELSE english_stem + END + END; + + + Or use the alternative comma-separated syntax: + ALTER TEXT SEARCH CONFIGURATION pg ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part WITH @@ -3183,7 +3290,8 @@ ts_debug( config re OUT description text, OUT token text, OUT dictionaries regdictionary[], - OUT dictionary regdictionary, + OUT configuration text, + OUT command text, OUT lexemes text[]) returns setof record @@ -3227,14 +3335,20 @@ ts_debug( config re - dictionary regdictionary — the dictionary - that recognized the token, or NULL if none did + configuration text — the + configuration defined for this token type + + + + + command text — the command that describes + the way the output was produced lexemes text[] — the lexeme(s) produced - by the dictionary that recognized the token, or NULL if + by the command selected according to the conditions, or NULL if none did; an empty array ({}) means it was recognized as a stop word @@ -3247,32 +3361,32 @@ SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); - alias | description | token | dictionaries | dictionary | lexemes ------------+-----------------+-------+----------------+--------------+--------- - asciiword | Word, all ASCII | a | {english_stem} | english_stem | {} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | cat | {english_stem} | english_stem | {cat} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | sat | {english_stem} | english_stem | {sat} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | on | {english_stem} |
english_stem | {} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | a | {english_stem} | english_stem | {} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | mat | {english_stem} | english_stem | {mat} - blank | Space symbols | | {} | | - blank | Space symbols | - | {} | | - asciiword | Word, all ASCII | it | {english_stem} | english_stem | {} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | ate | {english_stem} | english_stem | {ate} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | a | {english_stem} | english_stem | {} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | rats | {english_stem} | english_stem | {rat} + alias | description | token | dictionaries | configuration | command | lexemes +-----------+-----------------+-------+----------------+---------------+--------------+--------- + asciiword | Word, all ASCII | a | {english_stem} | english_stem | english_stem | {} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | fat | {english_stem} | english_stem | english_stem | {fat} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | cat | {english_stem} | english_stem | english_stem | {cat} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | sat | {english_stem} | english_stem | english_stem | {sat} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | on | {english_stem} | english_stem | english_stem | {} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | a | {english_stem} | english_stem | english_stem | {} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | mat | {english_stem} | english_stem | english_stem | {mat} + blank | Space symbols | | | | | + blank | Space symbols | - | | | | + asciiword | Word, all ASCII | it | {english_stem} | english_stem | english_stem | {} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | ate | {english_stem} | english_stem | english_stem | {ate} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | a | {english_stem} | english_stem | english_stem | {} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | fat | {english_stem} | english_stem | english_stem | {fat} + blank | Space symbols | | | | | + asciiword | Word, all ASCII | rats | {english_stem} | english_stem | english_stem | {rat} @@ -3298,13 +3412,22 @@ ALTER TEXT SEARCH CONFIGURATION public.english SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); - alias | description | token | dictionaries | dictionary | lexemes ------------+-----------------+-------------+-------------------------------+----------------+------------- - asciiword | Word, all ASCII | The | {english_ispell,english_stem} | english_ispell | {} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | Brightest | {english_ispell,english_stem} | english_ispell | {bright} - blank | Space symbols | | {} | | - asciiword | Word, all ASCII | supernovaes | {english_ispell,english_stem} | english_stem | {supernova} + alias | description | token | dictionaries | configuration | command | lexemes +-----------+-----------------+-------------+-------------------------------+---------------------------------------------+------------------+------------- + asciiword | Word, all ASCII | The | {english_ispell,english_stem} | CASE english_ispell WHEN MATCH THEN KEEP +| english_ispell | {} + | | | | ELSE CASE 
english_stem WHEN MATCH THEN KEEP+| | + | | | | END +| | + | | | | END | | + blank | Space symbols | | | | | + asciiword | Word, all ASCII | Brightest | {english_ispell,english_stem} | CASE english_ispell WHEN MATCH THEN KEEP +| english_ispell | {bright} + | | | | ELSE CASE english_stem WHEN MATCH THEN KEEP+| | + | | | | END +| | + | | | | END | | + blank | Space symbols | | | | | + asciiword | Word, all ASCII | supernovaes | {english_ispell,english_stem} | CASE english_ispell WHEN MATCH THEN KEEP +| english_stem | {supernova} + | | | | ELSE CASE english_stem WHEN MATCH THEN KEEP+| | + | | | | END +| | + | | | | END | | diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index e9e188682f..34b80aea34 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -948,55 +948,14 @@ GRANT SELECT (subdbid, subname, subowner, subenabled, subslotname, subpublicatio -- Tsearch debug function. Defined here because it'd be pretty unwieldy -- to put it into pg_proc.h -CREATE FUNCTION ts_debug(IN config regconfig, IN document text, - OUT alias text, - OUT description text, - OUT token text, - OUT dictionaries regdictionary[], - OUT dictionary regdictionary, - OUT lexemes text[]) -RETURNS SETOF record AS -$$ -SELECT - tt.alias AS alias, - tt.description AS description, - parse.token AS token, - ARRAY ( SELECT m.mapdict::pg_catalog.regdictionary - FROM pg_catalog.pg_ts_config_map AS m - WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid - ORDER BY m.mapseqno ) - AS dictionaries, - ( SELECT mapdict::pg_catalog.regdictionary - FROM pg_catalog.pg_ts_config_map AS m - WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid - ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno - LIMIT 1 - ) AS dictionary, - ( SELECT pg_catalog.ts_lexize(mapdict, parse.token) - FROM pg_catalog.pg_ts_config_map AS m - WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid - ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno - LIMIT 1 - ) AS lexemes -FROM pg_catalog.ts_parse( - (SELECT cfgparser FROM pg_catalog.pg_ts_config WHERE oid = $1 ), $2 - ) AS parse, - pg_catalog.ts_token_type( - (SELECT cfgparser FROM pg_catalog.pg_ts_config WHERE oid = $1 ) - ) AS tt -WHERE tt.tokid = parse.tokid -$$ -LANGUAGE SQL STRICT STABLE PARALLEL SAFE; - -COMMENT ON FUNCTION ts_debug(regconfig,text) IS - 'debug function for text search configuration'; CREATE FUNCTION ts_debug(IN document text, OUT alias text, OUT description text, OUT token text, OUT dictionaries regdictionary[], - OUT dictionary regdictionary, + OUT configuration text, + OUT command text, OUT lexemes text[]) RETURNS SETOF record AS $$ diff --git a/src/backend/commands/tsearchcmds.c b/src/backend/commands/tsearchcmds.c index 3a843512d1..53ee576223 100644 --- a/src/backend/commands/tsearchcmds.c +++ b/src/backend/commands/tsearchcmds.c @@ -39,9 +39,12 @@ #include "nodes/makefuncs.h" #include "parser/parse_func.h" #include "tsearch/ts_cache.h" +#include "tsearch/ts_public.h" #include "tsearch/ts_utils.h" +#include "tsearch/ts_configmap.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/jsonb.h" #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/syscache.h" @@ -935,11 +938,22 @@ makeConfigurationDependencies(HeapTuple tuple, bool removeOld, while (HeapTupleIsValid((maptup = systable_getnext(scan)))) { Form_pg_ts_config_map cfgmap = (Form_pg_ts_config_map) GETSTRUCT(maptup); + TSMapElement *mapdicts = 
JsonbToTSMap(DatumGetJsonbP(&cfgmap->mapdicts)); + Oid *dictionaryOids = TSMapGetDictionaries(mapdicts); + Oid *currentOid = dictionaryOids; - referenced.classId = TSDictionaryRelationId; - referenced.objectId = cfgmap->mapdict; - referenced.objectSubId = 0; - add_exact_object_address(&referenced, addrs); + while (*currentOid != InvalidOid) + { + referenced.classId = TSDictionaryRelationId; + referenced.objectId = *currentOid; + referenced.objectSubId = 0; + add_exact_object_address(&referenced, addrs); + + currentOid++; + } + + pfree(dictionaryOids); + TSMapElementFree(mapdicts); } systable_endscan(scan); @@ -1091,8 +1105,7 @@ DefineTSConfiguration(List *names, List *parameters, ObjectAddress *copied) mapvalues[Anum_pg_ts_config_map_mapcfg - 1] = cfgOid; mapvalues[Anum_pg_ts_config_map_maptokentype - 1] = cfgmap->maptokentype; - mapvalues[Anum_pg_ts_config_map_mapseqno - 1] = cfgmap->mapseqno; - mapvalues[Anum_pg_ts_config_map_mapdict - 1] = cfgmap->mapdict; + mapvalues[Anum_pg_ts_config_map_mapdicts - 1] = JsonbPGetDatum(&cfgmap->mapdicts); newmaptup = heap_form_tuple(mapRel->rd_att, mapvalues, mapnulls); @@ -1195,7 +1208,7 @@ AlterTSConfiguration(AlterTSConfigurationStmt *stmt) relMap = heap_open(TSConfigMapRelationId, RowExclusiveLock); /* Add or drop mappings */ - if (stmt->dicts) + if (stmt->dicts || stmt->dict_map) MakeConfigurationMapping(stmt, tup, relMap); else if (stmt->tokentype) DropConfigurationMapping(stmt, tup, relMap); @@ -1270,6 +1283,59 @@ getTokenTypes(Oid prsId, List *tokennames) return res; } +/* + * Parse the parse node extracted from a dictionary mapping and transform it into + * the internal representation of the dictionary mapping. + */ +static TSMapElement * +ParseTSMapConfig(DictMapElem *elem) +{ + TSMapElement *result = palloc0(sizeof(TSMapElement)); + + if (elem->kind == DICT_MAP_CASE) + { + TSMapCase *caseObject = palloc0(sizeof(TSMapCase)); + DictMapCase *caseASTObject = elem->data; + + caseObject->condition = ParseTSMapConfig(caseASTObject->condition); + caseObject->command = ParseTSMapConfig(caseASTObject->command); + + if (caseASTObject->elsebranch) + caseObject->elsebranch = ParseTSMapConfig(caseASTObject->elsebranch); + + caseObject->match = caseASTObject->match; + + caseObject->condition->parent = result; + caseObject->command->parent = result; + + result->type = TSMAP_CASE; + result->value.objectCase = caseObject; + } + else if (elem->kind == DICT_MAP_EXPRESSION) + { + TSMapExpression *expression = palloc0(sizeof(TSMapExpression)); + DictMapExprElem *expressionAST = elem->data; + + expression->left = ParseTSMapConfig(expressionAST->left); + expression->right = ParseTSMapConfig(expressionAST->right); + expression->operator = expressionAST->oper; + + result->type = TSMAP_EXPRESSION; + result->value.objectExpression = expression; + } + else if (elem->kind == DICT_MAP_KEEP) + { + result->value.objectExpression = NULL; + result->type = TSMAP_KEEP; + } + else if (elem->kind == DICT_MAP_DICTIONARY) + { + result->value.objectDictionary = get_ts_dict_oid(elem->data, false); + result->type = TSMAP_DICTIONARY; + } + return result; +} + /* * ALTER TEXT SEARCH CONFIGURATION ADD/ALTER MAPPING */ @@ -1286,8 +1352,9 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt, Oid prsId; int *tokens, ntoken; - Oid *dictIds; - int ndict; + Oid *dictIds = NULL; + int ndict = 0; + TSMapElement *config = NULL; ListCell *c; prsId = ((Form_pg_ts_config) GETSTRUCT(tup))->cfgparser; @@ -1326,15 +1393,18 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt, /* * Convert list of
dictionary names to array of dict OIDs */ - ndict = list_length(stmt->dicts); - dictIds = (Oid *) palloc(sizeof(Oid) * ndict); - i = 0; - foreach(c, stmt->dicts) + if (stmt->dicts) { - List *names = (List *) lfirst(c); + ndict = list_length(stmt->dicts); + dictIds = (Oid *) palloc(sizeof(Oid) * ndict); + i = 0; + foreach(c, stmt->dicts) + { + List *names = (List *) lfirst(c); - dictIds[i] = get_ts_dict_oid(names, false); - i++; + dictIds[i] = get_ts_dict_oid(names, false); + i++; + } } if (stmt->replace) @@ -1356,6 +1426,10 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt, while (HeapTupleIsValid((maptup = systable_getnext(scan)))) { Form_pg_ts_config_map cfgmap = (Form_pg_ts_config_map) GETSTRUCT(maptup); + Datum repl_val[Natts_pg_ts_config_map]; + bool repl_null[Natts_pg_ts_config_map]; + bool repl_repl[Natts_pg_ts_config_map]; + HeapTuple newtup; /* * check if it's one of target token types @@ -1379,25 +1453,21 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt, /* * replace dictionary if match */ - if (cfgmap->mapdict == dictOld) - { - Datum repl_val[Natts_pg_ts_config_map]; - bool repl_null[Natts_pg_ts_config_map]; - bool repl_repl[Natts_pg_ts_config_map]; - HeapTuple newtup; - - memset(repl_val, 0, sizeof(repl_val)); - memset(repl_null, false, sizeof(repl_null)); - memset(repl_repl, false, sizeof(repl_repl)); - - repl_val[Anum_pg_ts_config_map_mapdict - 1] = ObjectIdGetDatum(dictNew); - repl_repl[Anum_pg_ts_config_map_mapdict - 1] = true; - - newtup = heap_modify_tuple(maptup, - RelationGetDescr(relMap), - repl_val, repl_null, repl_repl); - CatalogTupleUpdate(relMap, &newtup->t_self, newtup); - } + config = JsonbToTSMap(DatumGetJsonbP(&cfgmap->mapdicts)); + TSMapReplaceDictionary(config, dictOld, dictNew); + + memset(repl_val, 0, sizeof(repl_val)); + memset(repl_null, false, sizeof(repl_null)); + memset(repl_repl, false, sizeof(repl_repl)); + + repl_val[Anum_pg_ts_config_map_mapdicts - 1] = JsonbPGetDatum(TSMapToJsonb(config)); + repl_repl[Anum_pg_ts_config_map_mapdicts - 1] = true; + + newtup = heap_modify_tuple(maptup, + RelationGetDescr(relMap), + repl_val, repl_null, repl_repl); + CatalogTupleUpdate(relMap, &newtup->t_self, newtup); + pfree(config); } systable_endscan(scan); @@ -1407,24 +1477,22 @@ MakeConfigurationMapping(AlterTSConfigurationStmt *stmt, /* * Insertion of new entries */ + config = ParseTSMapConfig(stmt->dict_map); + for (i = 0; i < ntoken; i++) { - for (j = 0; j < ndict; j++) - { - Datum values[Natts_pg_ts_config_map]; - bool nulls[Natts_pg_ts_config_map]; + Datum values[Natts_pg_ts_config_map]; + bool nulls[Natts_pg_ts_config_map]; - memset(nulls, false, sizeof(nulls)); - values[Anum_pg_ts_config_map_mapcfg - 1] = ObjectIdGetDatum(cfgId); - values[Anum_pg_ts_config_map_maptokentype - 1] = Int32GetDatum(tokens[i]); - values[Anum_pg_ts_config_map_mapseqno - 1] = Int32GetDatum(j + 1); - values[Anum_pg_ts_config_map_mapdict - 1] = ObjectIdGetDatum(dictIds[j]); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_ts_config_map_mapcfg - 1] = ObjectIdGetDatum(cfgId); + values[Anum_pg_ts_config_map_maptokentype - 1] = Int32GetDatum(tokens[i]); + values[Anum_pg_ts_config_map_mapdicts - 1] = JsonbPGetDatum(TSMapToJsonb(config)); - tup = heap_form_tuple(relMap->rd_att, values, nulls); - CatalogTupleInsert(relMap, tup); + tup = heap_form_tuple(relMap->rd_att, values, nulls); + CatalogTupleInsert(relMap, tup); - heap_freetuple(tup); - } + heap_freetuple(tup); } } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 
c3efca3c45..a2235c3c0c 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4439,6 +4439,42 @@ _copyReassignOwnedStmt(const ReassignOwnedStmt *from) return newnode; } +static DictMapElem * +_copyDictMapElem(const DictMapElem *from) +{ + DictMapElem *newnode = makeNode(DictMapElem); + + COPY_SCALAR_FIELD(kind); + COPY_NODE_FIELD(data); + + return newnode; +} + +static DictMapExprElem * +_copyDictMapExprElem(const DictMapExprElem *from) +{ + DictMapExprElem *newnode = makeNode(DictMapExprElem); + + COPY_NODE_FIELD(left); + COPY_NODE_FIELD(right); + COPY_SCALAR_FIELD(oper); + + return newnode; +} + +static DictMapCase * +_copyDictMapCase(const DictMapCase *from) +{ + DictMapCase *newnode = makeNode(DictMapCase); + + COPY_NODE_FIELD(condition); + COPY_NODE_FIELD(command); + COPY_NODE_FIELD(elsebranch); + COPY_SCALAR_FIELD(match); + + return newnode; +} + static AlterTSDictionaryStmt * _copyAlterTSDictionaryStmt(const AlterTSDictionaryStmt *from) { @@ -5452,6 +5488,15 @@ copyObjectImpl(const void *from) case T_ReassignOwnedStmt: retval = _copyReassignOwnedStmt(from); break; + case T_DictMapExprElem: + retval = _copyDictMapExprElem(from); + break; + case T_DictMapElem: + retval = _copyDictMapElem(from); + break; + case T_DictMapCase: + retval = _copyDictMapCase(from); + break; case T_AlterTSDictionaryStmt: retval = _copyAlterTSDictionaryStmt(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 45ceba2830..71a8f9b914 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2217,6 +2217,36 @@ _equalReassignOwnedStmt(const ReassignOwnedStmt *a, const ReassignOwnedStmt *b) return true; } +static bool +_equalDictMapElem(const DictMapElem *a, const DictMapElem *b) +{ + COMPARE_NODE_FIELD(data); + COMPARE_SCALAR_FIELD(kind); + + return true; +} + +static bool +_equalDictMapExprElem(const DictMapExprElem *a, const DictMapExprElem *b) +{ + COMPARE_NODE_FIELD(left); + COMPARE_NODE_FIELD(right); + COMPARE_SCALAR_FIELD(oper); + + return true; +} + +static bool +_equalDictMapCase(const DictMapCase *a, const DictMapCase *b) +{ + COMPARE_NODE_FIELD(condition); + COMPARE_NODE_FIELD(command); + COMPARE_NODE_FIELD(elsebranch); + COMPARE_SCALAR_FIELD(match); + + return true; +} + static bool _equalAlterTSDictionaryStmt(const AlterTSDictionaryStmt *a, const AlterTSDictionaryStmt *b) { @@ -3575,6 +3605,15 @@ equal(const void *a, const void *b) case T_ReassignOwnedStmt: retval = _equalReassignOwnedStmt(a, b); break; + case T_DictMapExprElem: + retval = _equalDictMapExprElem(a, b); + break; + case T_DictMapElem: + retval = _equalDictMapElem(a, b); + break; + case T_DictMapCase: + retval = _equalDictMapCase(a, b); + break; case T_AlterTSDictionaryStmt: retval = _equalAlterTSDictionaryStmt(a, b); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index b879358de1..84ae8b17f4 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -52,6 +52,7 @@ #include "catalog/namespace.h" #include "catalog/pg_am.h" #include "catalog/pg_trigger.h" +#include "catalog/pg_ts_config_map.h" #include "commands/defrem.h" #include "commands/trigger.h" #include "nodes/makefuncs.h" @@ -241,6 +242,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); PartitionSpec *partspec; PartitionBoundSpec *partboundspec; RoleSpec *rolespec; + DictMapElem *dmapelem; } %type stmt schema_stmt @@ -310,7 +312,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node 
*query); analyze_option_list analyze_option_elem %type opt_or_replace opt_grant_grant_option opt_grant_admin_option - opt_nowait opt_if_exists opt_with_data + opt_nowait opt_if_exists opt_with_data opt_dictionary_map_no %type opt_nowait_or_skip %type OptRoleList AlterOptRoleList @@ -585,6 +587,12 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type hash_partbound partbound_datum_list range_datum_list %type hash_partbound_elem +%type dictionary_map_set_expr_operator +%type dictionary_map_dict dictionary_map_command_expr_paren + dictionary_config dictionary_map_case + dictionary_map_action opt_dictionary_map_case_else + dictionary_config_comma + %type merge_when_clause opt_and_condition %type merge_when_list %type merge_update merge_delete merge_insert @@ -650,13 +658,13 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); JOIN - KEY + KEEP KEY LABEL LANGUAGE LARGE_P LAST_P LATERAL_P LEADING LEAKPROOF LEAST LEFT LEVEL LIKE LIMIT LISTEN LOAD LOCAL LOCALTIME LOCALTIMESTAMP LOCATION LOCK_P LOCKED LOGGED - MAPPING MATCH MATCHED MATERIALIZED MAXVALUE MERGE METHOD + MAP MAPPING MATCH MATCHED MATERIALIZED MAXVALUE MERGE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE @@ -10355,24 +10363,26 @@ AlterTSDictionaryStmt: ; AlterTSConfigurationStmt: - ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list any_with any_name_list + ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list any_with dictionary_config { AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt); n->kind = ALTER_TSCONFIG_ADD_MAPPING; n->cfgname = $5; n->tokentype = $9; - n->dicts = $11; + n->dict_map = $11; + n->dicts = NULL; n->override = false; n->replace = false; $$ = (Node*)n; } - | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list any_with any_name_list + | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list any_with dictionary_config { AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt); n->kind = ALTER_TSCONFIG_ALTER_MAPPING_FOR_TOKEN; n->cfgname = $5; n->tokentype = $9; - n->dicts = $11; + n->dict_map = $11; + n->dicts = NULL; n->override = true; n->replace = false; $$ = (Node*)n; @@ -10424,6 +10434,100 @@ any_with: WITH {} | WITH_LA {} ; +opt_dictionary_map_no: + NO { $$ = true; } + | { $$ = false; } + ; + +dictionary_config_comma: + dictionary_map_dict { $$ = $1; } + | dictionary_map_dict ',' dictionary_config_comma + { + DictMapExprElem *n = makeNode(DictMapExprElem); + DictMapElem *r = makeNode(DictMapElem); + + n->left = $1; + n->oper = TSMAP_OP_COMMA; + n->right = $3; + + r->kind = DICT_MAP_EXPRESSION; + r->data = n; + $$ = r; + } + ; + +dictionary_map_action: + KEEP + { + DictMapElem *n = makeNode(DictMapElem); + n->kind = DICT_MAP_KEEP; + n->data = NULL; + $$ = n; + } + | dictionary_config { $$ = $1; } + ; + +opt_dictionary_map_case_else: + ELSE dictionary_config { $$ = $2; } + | { $$ = NULL; } + ; + +dictionary_map_case: + CASE dictionary_config WHEN opt_dictionary_map_no MATCH THEN dictionary_map_action opt_dictionary_map_case_else END_P + { + DictMapCase *n = makeNode(DictMapCase); + DictMapElem *r = makeNode(DictMapElem); + + n->condition = $2; + n->command = $7; + n->elsebranch = $8; + n->match = !$4; + + r->kind = DICT_MAP_CASE; + r->data = n; + $$ = r; + } + ; + +dictionary_map_set_expr_operator: + UNION { $$ = TSMAP_OP_UNION; } + | EXCEPT { $$ = TSMAP_OP_EXCEPT; } + | INTERSECT { $$ = TSMAP_OP_INTERSECT; 
} | MAP { $$ = TSMAP_OP_MAP; } + ; + +dictionary_config: + dictionary_map_command_expr_paren { $$ = $1; } + | dictionary_config dictionary_map_set_expr_operator dictionary_map_command_expr_paren + { + DictMapExprElem *n = makeNode(DictMapExprElem); + DictMapElem *r = makeNode(DictMapElem); + + n->left = $1; + n->oper = $2; + n->right = $3; + + r->kind = DICT_MAP_EXPRESSION; + r->data = n; + $$ = r; + } + ; + +dictionary_map_command_expr_paren: + '(' dictionary_config ')' { $$ = $2; } + | dictionary_map_case { $$ = $1; } + | dictionary_config_comma { $$ = $1; } + ; + +dictionary_map_dict: + any_name + { + DictMapElem *n = makeNode(DictMapElem); + n->kind = DICT_MAP_DICTIONARY; + n->data = $1; + $$ = n; + } + ; /***************************************************************************** * @@ -15241,6 +15345,7 @@ unreserved_keyword: | LOCK_P | LOCKED | LOGGED + | MAP | MAPPING | MATCH | MATCHED @@ -15549,6 +15654,7 @@ reserved_keyword: | INITIALLY | INTERSECT | INTO + | KEEP | LATERAL_P | LEADING | LIMIT diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile index 227468ae9e..e61ad4fa1d 100644 --- a/src/backend/tsearch/Makefile +++ b/src/backend/tsearch/Makefile @@ -26,7 +26,7 @@ DICTFILES_PATH=$(addprefix dicts/,$(DICTFILES)) OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ - to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o + to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o ts_configmap.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tsearch/ts_configmap.c b/src/backend/tsearch/ts_configmap.c new file mode 100644 index 0000000000..714f2a8ab2 --- /dev/null +++ b/src/backend/tsearch/ts_configmap.c @@ -0,0 +1,1114 @@ +/*------------------------------------------------------------------------- + * + * ts_configmap.c + * internal representation of text search configuration and utilities for it + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/ts_configmap.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "catalog/indexing.h" +#include "catalog/pg_ts_dict.h" +#include "catalog/pg_namespace.h" +#include "catalog/namespace.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_configmap.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" + +/* + * Size chosen arbitrarily, on the assumption that 1024 stack frames + * are enough for parsing configurations + */ +#define JSONB_PARSE_STATE_STACK_SIZE 1024 + +/* + * Used during the parsing of TSMapElement from JSONB into internal + * data structures.
+ */ +typedef enum TSMapParseState +{ + TSMPS_WAIT_ELEMENT, + TSMPS_READ_DICT_OID, + TSMPS_READ_COMPLEX_OBJ, + TSMPS_READ_EXPRESSION, + TSMPS_READ_CASE, + TSMPS_READ_OPERATOR, + TSMPS_READ_COMMAND, + TSMPS_READ_CONDITION, + TSMPS_READ_ELSEBRANCH, + TSMPS_READ_MATCH, + TSMPS_READ_KEEP, + TSMPS_READ_LEFT, + TSMPS_READ_RIGHT +} TSMapParseState; + +/* + * Context used during JSONB parsing to construct a TSMap + */ +typedef struct TSMapJsonbParseData +{ + TSMapParseState states[JSONB_PARSE_STATE_STACK_SIZE]; /* Stack of states of + * JSONB parsing + * automaton */ + int statesIndex; /* Index of current stack frame */ + TSMapElement *element; /* Element that is under construction now */ +} TSMapJsonbParseData; + +static JsonbValue *TSMapElementToJsonbValue(TSMapElement *element, JsonbParseState *jsonbState); +static TSMapElement * JsonbToTSMapElement(JsonbContainer *root); + +/* + * Print the name of the namespace into the StringInfo variable result + */ +static void +TSMapPrintNamespace(Oid namespaceId, StringInfo result) +{ + Relation maprel; + Relation mapidx; + ScanKeyData mapskey; + SysScanDesc mapscan; + HeapTuple maptup; + Form_pg_namespace namespace; + + maprel = heap_open(NamespaceRelationId, AccessShareLock); + mapidx = index_open(NamespaceOidIndexId, AccessShareLock); + + ScanKeyInit(&mapskey, ObjectIdAttributeNumber, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(namespaceId)); + mapscan = systable_beginscan_ordered(maprel, mapidx, + NULL, 1, &mapskey); + + maptup = systable_getnext_ordered(mapscan, ForwardScanDirection); + namespace = (Form_pg_namespace) GETSTRUCT(maptup); + appendStringInfoString(result, namespace->nspname.data); + appendStringInfoChar(result, '.'); + + systable_endscan_ordered(mapscan); + index_close(mapidx, AccessShareLock); + heap_close(maprel, AccessShareLock); +} + +/* + * Print the name of the dictionary into the StringInfo variable result + */ +void +TSMapPrintDictName(Oid dictId, StringInfo result) +{ + Relation maprel; + Relation mapidx; + ScanKeyData mapskey; + SysScanDesc mapscan; + HeapTuple maptup; + Form_pg_ts_dict dict; + + maprel = heap_open(TSDictionaryRelationId, AccessShareLock); + mapidx = index_open(TSDictionaryOidIndexId, AccessShareLock); + + ScanKeyInit(&mapskey, ObjectIdAttributeNumber, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(dictId)); + mapscan = systable_beginscan_ordered(maprel, mapidx, + NULL, 1, &mapskey); + + maptup = systable_getnext_ordered(mapscan, ForwardScanDirection); + dict = (Form_pg_ts_dict) GETSTRUCT(maptup); + if (!TSDictionaryIsVisible(dictId)) + { + TSMapPrintNamespace(dict->dictnamespace, result); + } + appendStringInfoString(result, dict->dictname.data); + + systable_endscan_ordered(mapscan); + index_close(mapidx, AccessShareLock); + heap_close(maprel, AccessShareLock); +} + +/* + * Print the expression into the StringInfo variable result + */ +static void +TSMapPrintExpression(TSMapExpression *expression, StringInfo result) +{ + Assert(expression->left); + if (expression->left->type == TSMAP_EXPRESSION && + expression->left->value.objectExpression->operator != expression->operator) + { + appendStringInfoChar(result, '('); + } + TSMapPrintElement(expression->left, result); + if (expression->left->type == TSMAP_EXPRESSION && + expression->left->value.objectExpression->operator != expression->operator) + { + appendStringInfoChar(result, ')'); + } + + switch (expression->operator) + { + case TSMAP_OP_UNION: + appendStringInfoString(result, " UNION "); + break; + case
TSMAP_OP_EXCEPT: + appendStringInfoString(result, " EXCEPT "); + break; + case TSMAP_OP_INTERSECT: + appendStringInfoString(result, " INTERSECT "); + break; + case TSMAP_OP_COMMA: + appendStringInfoString(result, ", "); + break; + case TSMAP_OP_MAP: + appendStringInfoString(result, " MAP "); + break; + default: + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("text search configuration is invalid"), + errdetail("Text search configuration contains invalid expression operator."))); + break; + } + + Assert(expression->right); + if (expression->right->type == TSMAP_EXPRESSION && + expression->right->value.objectExpression->operator != expression->operator) + { + appendStringInfoChar(result, '('); + } + TSMapPrintElement(expression->right, result); + if (expression->right->type == TSMAP_EXPRESSION && + expression->right->value.objectExpression->operator != expression->operator) + { + appendStringInfoChar(result, ')'); + } +} + +/* + * Print the CASE construction into the StringInfo variable result + */ +static void +TSMapPrintCase(TSMapCase *caseObject, StringInfo result) +{ + appendStringInfoString(result, "CASE "); + + TSMapPrintElement(caseObject->condition, result); + + appendStringInfoString(result, " WHEN "); + if (!caseObject->match) + appendStringInfoString(result, "NO "); + appendStringInfoString(result, "MATCH THEN "); + + TSMapPrintElement(caseObject->command, result); + + if (caseObject->elsebranch != NULL) + { + appendStringInfoString(result, "\nELSE "); + TSMapPrintElement(caseObject->elsebranch, result); + } + appendStringInfoString(result, "\nEND"); +} + +/* + * Print the element into the StringInfo result. + * Dispatches on the element type and delegates to the printing functions above. + */ +void +TSMapPrintElement(TSMapElement *element, StringInfo result) +{ + switch (element->type) + { + case TSMAP_EXPRESSION: + TSMapPrintExpression(element->value.objectExpression, result); + break; + case TSMAP_DICTIONARY: + TSMapPrintDictName(element->value.objectDictionary, result); + break; + case TSMAP_CASE: + TSMapPrintCase(element->value.objectCase, result); + break; + case TSMAP_KEEP: + appendStringInfoString(result, "KEEP"); + break; + default: + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("text search configuration is invalid"), + errdetail("Text search configuration contains elements with invalid type."))); + break; + } +} + +/* + * Print the text search configuration as text.
+ */ +Datum +dictionary_mapping_to_text(PG_FUNCTION_ARGS) +{ + Oid cfgOid = PG_GETARG_OID(0); + int32 tokentype = PG_GETARG_INT32(1); + StringInfo rawResult; + text *result = NULL; + TSConfigCacheEntry *cacheEntry; + + cacheEntry = lookup_ts_config_cache(cfgOid); + rawResult = makeStringInfo(); + + if (cacheEntry->lenmap > tokentype && cacheEntry->map[tokentype] != NULL) + { + TSMapElement *element = cacheEntry->map[tokentype]; + + TSMapPrintElement(element, rawResult); + } + + result = cstring_to_text(rawResult->data); + pfree(rawResult); + PG_RETURN_TEXT_P(result); +} + +/* ---------------- + * Functions used to convert TSMap structure into JSONB representation + * ---------------- + */ + +/* + * Convert an integer value into JsonbValue + */ +static JsonbValue * +IntToJsonbValue(int intValue) +{ + char buffer[16]; + JsonbValue *value = palloc0(sizeof(JsonbValue)); + + /* + * The buffer is large enough for any int value: up to 11 characters + * with sign, plus the terminating NUL character + */ + memset(buffer, 0, sizeof(buffer)); + + pg_ltoa(intValue, buffer); + value->type = jbvNumeric; + value->val.numeric = DatumGetNumeric(DirectFunctionCall3(numeric_in, + CStringGetDatum(buffer), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1) + )); + return value; +} + +/* + * Convert an FTS configuration expression into JsonbValue + */ +static JsonbValue * +TSMapExpressionToJsonbValue(TSMapExpression *expression, JsonbParseState *jsonbState) +{ + JsonbValue key; + JsonbValue *value = NULL; + + pushJsonbValue(&jsonbState, WJB_BEGIN_OBJECT, NULL); + + key.type = jbvString; + key.val.string.len = strlen("operator"); + key.val.string.val = "operator"; + value = IntToJsonbValue(expression->operator); + + pushJsonbValue(&jsonbState, WJB_KEY, &key); + pushJsonbValue(&jsonbState, WJB_VALUE, value); + + key.type = jbvString; + key.val.string.len = strlen("left"); + key.val.string.val = "left"; + + pushJsonbValue(&jsonbState, WJB_KEY, &key); + value = TSMapElementToJsonbValue(expression->left, jsonbState); + if (value && IsAJsonbScalar(value)) + pushJsonbValue(&jsonbState, WJB_VALUE, value); + + key.type = jbvString; + key.val.string.len = strlen("right"); + key.val.string.val = "right"; + + pushJsonbValue(&jsonbState, WJB_KEY, &key); + value = TSMapElementToJsonbValue(expression->right, jsonbState); + if (value && IsAJsonbScalar(value)) + pushJsonbValue(&jsonbState, WJB_VALUE, value); + + return pushJsonbValue(&jsonbState, WJB_END_OBJECT, NULL); +} + +/* + * Convert an FTS configuration case into JsonbValue + */ +static JsonbValue * +TSMapCaseToJsonbValue(TSMapCase *caseObject, JsonbParseState *jsonbState) +{ + JsonbValue key; + JsonbValue *value = NULL; + + pushJsonbValue(&jsonbState, WJB_BEGIN_OBJECT, NULL); + + key.type = jbvString; + key.val.string.len = strlen("condition"); + key.val.string.val = "condition"; + + pushJsonbValue(&jsonbState, WJB_KEY, &key); + value = TSMapElementToJsonbValue(caseObject->condition, jsonbState); + + if (value && IsAJsonbScalar(value)) + pushJsonbValue(&jsonbState, WJB_VALUE, value); + + key.type = jbvString; + key.val.string.len = strlen("command"); + key.val.string.val = "command"; + + pushJsonbValue(&jsonbState, WJB_KEY, &key); + value = TSMapElementToJsonbValue(caseObject->command, jsonbState); + + if (value && IsAJsonbScalar(value)) + pushJsonbValue(&jsonbState, WJB_VALUE, value); + + if (caseObject->elsebranch != NULL) + { + key.type = jbvString; + key.val.string.len = strlen("elsebranch"); + key.val.string.val = "elsebranch"; + +
pushJsonbValue(&jsonbState, WJB_KEY, &key); + value = TSMapElementToJsonbValue(caseObject->elsebranch, jsonbState); + + if (value && IsAJsonbScalar(value)) + pushJsonbValue(&jsonbState, WJB_VALUE, value); + } + + key.type = jbvString; + key.val.string.len = strlen("match"); + key.val.string.val = "match"; + + value = IntToJsonbValue(caseObject->match ? 1 : 0); + + pushJsonbValue(&jsonbState, WJB_KEY, &key); + pushJsonbValue(&jsonbState, WJB_VALUE, value); + + return pushJsonbValue(&jsonbState, WJB_END_OBJECT, NULL); +} + +/* + * Convert an FTS KEEP command into JsonbValue + */ +static JsonbValue * +TSMapKeepToJsonbValue(JsonbParseState *jsonbState) +{ + JsonbValue *value = palloc0(sizeof(JsonbValue)); + + value->type = jbvString; + value->val.string.len = strlen("keep"); + value->val.string.val = "keep"; + + return pushJsonbValue(&jsonbState, WJB_VALUE, value); +} + +/* + * Convert an FTS element into JsonbValue. Common entry point for all types of TSMapElement + */ +JsonbValue * +TSMapElementToJsonbValue(TSMapElement *element, JsonbParseState *jsonbState) +{ + JsonbValue *result = NULL; + + if (element != NULL) + { + switch (element->type) + { + case TSMAP_EXPRESSION: + result = TSMapExpressionToJsonbValue(element->value.objectExpression, jsonbState); + break; + case TSMAP_DICTIONARY: + result = IntToJsonbValue(element->value.objectDictionary); + break; + case TSMAP_CASE: + result = TSMapCaseToJsonbValue(element->value.objectCase, jsonbState); + break; + case TSMAP_KEEP: + result = TSMapKeepToJsonbValue(jsonbState); + break; + default: + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("text search configuration is invalid"), + errdetail("Required text search configuration contains elements with invalid type."))); + break; + } + } + return result; +} + +/* + * Convert an FTS configuration into JSONB + */ +Jsonb * +TSMapToJsonb(TSMapElement *element) +{ + JsonbParseState *jsonbState = NULL; + JsonbValue *out; + Jsonb *result; + + out = TSMapElementToJsonbValue(element, jsonbState); + + result = JsonbValueToJsonb(out); + return result; +} + +/* ---------------- + * Functions used to get TSMap structure from JSONB representation + * ---------------- + */ + +/* + * Extract an integer from JsonbValue + */ +static int +JsonbValueToInt(JsonbValue *value) +{ + char *str; + + str = DatumGetCString(DirectFunctionCall1(numeric_out, NumericGetDatum(value->val.numeric))); + return pg_atoi(str, sizeof(int), 0); +} + +/* + * Check whether a key is one of the FTS configuration case fields + */ +static bool +IsTSMapCaseKey(JsonbValue *value) +{ + /* + * The JsonbValue string may not be null-terminated. Convert it for correct + * behavior of the strcmp function. + */ + char *key = palloc0(sizeof(char) * (value->val.string.len + 1)); + + key[value->val.string.len] = '\0'; + memcpy(key, value->val.string.val, sizeof(char) * value->val.string.len); + return strcmp(key, "match") == 0 || strcmp(key, "condition") == 0 || strcmp(key, "command") == 0 || strcmp(key, "elsebranch") == 0; +} + +/* + * Check whether a key is one of the FTS configuration expression fields + */ +static bool +IsTSMapExpressionKey(JsonbValue *value) +{ + /* + * The JsonbValue string may not be null-terminated. Convert it for correct + * behavior of the strcmp function.
*/ + char *key = palloc0(sizeof(char) * (value->val.string.len + 1)); + + key[value->val.string.len] = '\0'; + memcpy(key, value->val.string.val, sizeof(char) * value->val.string.len); + return strcmp(key, "operator") == 0 || strcmp(key, "left") == 0 || strcmp(key, "right") == 0; +} + +/* + * Configure parseData->element according to the value (key) + */ +static void +JsonbBeginObjectKey(JsonbValue value, TSMapJsonbParseData *parseData) +{ + TSMapElement *parentElement = parseData->element; + + parseData->element = palloc0(sizeof(TSMapElement)); + parseData->element->parent = parentElement; + + /* Overwrite object-type state based on key */ + if (IsTSMapExpressionKey(&value)) + { + parseData->states[parseData->statesIndex] = TSMPS_READ_EXPRESSION; + parseData->element->type = TSMAP_EXPRESSION; + parseData->element->value.objectExpression = palloc0(sizeof(TSMapExpression)); + } + else if (IsTSMapCaseKey(&value)) + { + parseData->states[parseData->statesIndex] = TSMPS_READ_CASE; + parseData->element->type = TSMAP_CASE; + parseData->element->value.objectCase = palloc0(sizeof(TSMapCase)); + } +} + +/* + * Process a JsonbValue inside an FTS configuration expression + */ +static void +JsonbKeyExpressionProcessing(JsonbValue value, TSMapJsonbParseData *parseData) +{ + /* + * The JsonbValue string may not be null-terminated. Convert it for correct + * behavior of the strcmp function. + */ + char *key = palloc0(sizeof(char) * (value.val.string.len + 1)); + + memcpy(key, value.val.string.val, sizeof(char) * value.val.string.len); + parseData->statesIndex++; + + if (parseData->statesIndex >= JSONB_PARSE_STATE_STACK_SIZE) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("configuration is too complex to be parsed"), + errdetail("Configurations with more than %d nested objects are not supported.", + JSONB_PARSE_STATE_STACK_SIZE))); + + if (strcmp(key, "operator") == 0) + parseData->states[parseData->statesIndex] = TSMPS_READ_OPERATOR; + else if (strcmp(key, "left") == 0) + parseData->states[parseData->statesIndex] = TSMPS_READ_LEFT; + else if (strcmp(key, "right") == 0) + parseData->states[parseData->statesIndex] = TSMPS_READ_RIGHT; +} + +/* + * Process a JsonbValue inside an FTS configuration case + */ +static void +JsonbKeyCaseProcessing(JsonbValue value, TSMapJsonbParseData *parseData) +{ + /* + * The JsonbValue string may not be null-terminated. Convert it for correct + * behavior of the strcmp function.
*/ + char *key = palloc0(sizeof(char) * (value.val.string.len + 1)); + + memcpy(key, value.val.string.val, sizeof(char) * value.val.string.len); + parseData->statesIndex++; + + if (parseData->statesIndex >= JSONB_PARSE_STATE_STACK_SIZE) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("configuration is too complex to be parsed"), + errdetail("Configurations with more than %d nested objects are not supported.", + JSONB_PARSE_STATE_STACK_SIZE))); + + if (strcmp(key, "condition") == 0) + parseData->states[parseData->statesIndex] = TSMPS_READ_CONDITION; + else if (strcmp(key, "command") == 0) + parseData->states[parseData->statesIndex] = TSMPS_READ_COMMAND; + else if (strcmp(key, "elsebranch") == 0) + parseData->states[parseData->statesIndex] = TSMPS_READ_ELSEBRANCH; + else if (strcmp(key, "match") == 0) + parseData->states[parseData->statesIndex] = TSMPS_READ_MATCH; +} + +/* + * Convert a JsonbValue into an OID TSMapElement + */ +static TSMapElement * +JsonbValueToOidElement(JsonbValue *value, TSMapElement *parent) +{ + TSMapElement *element = palloc0(sizeof(TSMapElement)); + + element->parent = parent; + element->type = TSMAP_DICTIONARY; + element->value.objectDictionary = JsonbValueToInt(value); + return element; +} + +/* + * Convert a JsonbValue into a string TSMapElement. + * Used for special values such as the KEEP command + */ +static TSMapElement * +JsonbValueReadString(JsonbValue *value, TSMapElement *parent) +{ + char *str; + TSMapElement *element = palloc0(sizeof(TSMapElement)); + + element->parent = parent; + str = palloc0(sizeof(char) * (value->val.string.len + 1)); + memcpy(str, value->val.string.val, sizeof(char) * value->val.string.len); + + if (strcmp(str, "keep") == 0) + element->type = TSMAP_KEEP; + + pfree(str); + + return element; +} + +/* + * Process a JsonbValue object + */ +static void +JsonbProcessElement(JsonbIteratorToken r, JsonbValue value, TSMapJsonbParseData *parseData) +{ + TSMapElement *element = NULL; + + switch (r) + { + case WJB_KEY: + + /* + * Construct a TSMapElement object. At the first key inside a JSONB + * object, the type is selected based on the key.
+ */ + if (parseData->states[parseData->statesIndex] == TSMPS_READ_COMPLEX_OBJ) + JsonbBeginObjectKey(value, parseData); + + if (parseData->states[parseData->statesIndex] == TSMPS_READ_EXPRESSION) + JsonbKeyExpressionProcessing(value, parseData); + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_CASE) + JsonbKeyCaseProcessing(value, parseData); + + break; + case WJB_BEGIN_OBJECT: + + /* + * Begin construction of new object + */ + parseData->statesIndex++; + parseData->states[parseData->statesIndex] = TSMPS_READ_COMPLEX_OBJ; + break; + case WJB_END_OBJECT: + + /* + * Save constructed object based on current state of parser + */ + if (parseData->states[parseData->statesIndex] == TSMPS_READ_LEFT) + parseData->element->parent->value.objectExpression->left = parseData->element; + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_RIGHT) + parseData->element->parent->value.objectExpression->right = parseData->element; + + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_CONDITION) + parseData->element->parent->value.objectCase->condition = parseData->element; + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_COMMAND) + parseData->element->parent->value.objectCase->command = parseData->element; + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_ELSEBRANCH) + parseData->element->parent->value.objectCase->elsebranch = parseData->element; + + parseData->statesIndex--; + Assert(parseData->statesIndex >= 0); + if (parseData->element->parent != NULL) + parseData->element = parseData->element->parent; + break; + case WJB_VALUE: + + /* + * Save a value inside constructing object + */ + if (value.type == jbvBinary) + element = JsonbToTSMapElement(value.val.binary.data); + else if (value.type == jbvString) + element = JsonbValueReadString(&value, parseData->element); + else if (value.type == jbvNumeric) + element = JsonbValueToOidElement(&value, parseData->element); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("text search configuration is invalid"), + errdetail("Text search configuration contains object with invalid type."))); + + if (parseData->states[parseData->statesIndex] == TSMPS_READ_CONDITION) + parseData->element->value.objectCase->condition = element; + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_COMMAND) + parseData->element->value.objectCase->command = element; + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_ELSEBRANCH) + parseData->element->value.objectCase->elsebranch = element; + else if (parseData->states[parseData->statesIndex] == TSMPS_READ_MATCH) + parseData->element->value.objectCase->match = JsonbValueToInt(&value) == 1 ? 
true : false;
+
+            else if (parseData->states[parseData->statesIndex] == TSMPS_READ_OPERATOR)
+                parseData->element->value.objectExpression->operator = JsonbValueToInt(&value);
+            else if (parseData->states[parseData->statesIndex] == TSMPS_READ_LEFT)
+                parseData->element->value.objectExpression->left = element;
+            else if (parseData->states[parseData->statesIndex] == TSMPS_READ_RIGHT)
+                parseData->element->value.objectExpression->right = element;
+
+            parseData->statesIndex--;
+            Assert(parseData->statesIndex >= 0);
+            if (parseData->element->parent != NULL)
+                parseData->element = parseData->element->parent;
+            break;
+        case WJB_ELEM:
+
+            /*
+             * Store a simple element such as a dictionary OID
+             */
+            if (parseData->states[parseData->statesIndex] == TSMPS_WAIT_ELEMENT)
+            {
+                if (parseData->element != NULL)
+                    parseData->element = JsonbValueToOidElement(&value, parseData->element->parent);
+                else
+                    parseData->element = JsonbValueToOidElement(&value, NULL);
+            }
+            break;
+        default:
+            /* Ignore unused JSONB tokens */
+            break;
+    }
+}
+
+/*
+ * Convert a JsonbContainer into a TSMapElement
+ */
+static TSMapElement *
+JsonbToTSMapElement(JsonbContainer *root)
+{
+    TSMapJsonbParseData parseData;
+    JsonbIteratorToken r;
+    JsonbIterator *it;
+    JsonbValue  val;
+
+    parseData.statesIndex = 0;
+    parseData.states[parseData.statesIndex] = TSMPS_WAIT_ELEMENT;
+    parseData.element = NULL;
+
+    it = JsonbIteratorInit(root);
+
+    while ((r = JsonbIteratorNext(&it, &val, true)) != WJB_DONE)
+        JsonbProcessElement(r, val, &parseData);
+
+    return parseData.element;
+}
+
+/*
+ * Convert a JSONB into a TSMapElement
+ */
+TSMapElement *
+JsonbToTSMap(Jsonb *json)
+{
+    JsonbContainer *root = &json->root;
+
+    return JsonbToTSMapElement(root);
+}
+
+/* ----------------
+ * Text Search Configuration Map Utils
+ * ----------------
+ */
+
+/*
+ * Dynamically extendable list of OIDs
+ */
+typedef struct OidList
+{
+    Oid        *data;
+    int         size;           /* Size of the data array. Uninitialized
+                                 * elements in data are filled with InvalidOid */
+} OidList;
+
+/*
+ * Initialize a list
+ */
+static OidList *
+OidListInit(void)
+{
+    OidList    *result = palloc0(sizeof(OidList));
+
+    result->size = 1;
+    result->data = palloc0(result->size * sizeof(Oid));
+    result->data[0] = InvalidOid;
+    return result;
+}
+
+/*
+ * Add a new OID into the list. If it is already stored in the list, it won't
+ * be added a second time.
+ */
+static void
+OidListAdd(OidList *list, Oid oid)
+{
+    int         i;
+
+    /* Search for the Oid in the list */
+    for (i = 0; list->data[i] != InvalidOid; i++)
+        if (list->data[i] == oid)
+            return;
+
+    /* If not found, insert it at the end of the list */
+    if (i >= list->size - 1)
+    {
+        int         j;
+
+        list->size = list->size * 2;
+        list->data = repalloc(list->data, sizeof(Oid) * list->size);
+
+        for (j = i; j < list->size; j++)
+            list->data[j] = InvalidOid;
+    }
+    list->data[i] = oid;
+}
+
+/*
+ * Get OIDs of all dictionaries used in a TSMapElement.
+ * Used for internal recursive calls.
+ */ +static void +TSMapGetDictionariesInternal(TSMapElement *config, OidList *list) +{ + switch (config->type) + { + case TSMAP_EXPRESSION: + TSMapGetDictionariesInternal(config->value.objectExpression->left, list); + TSMapGetDictionariesInternal(config->value.objectExpression->right, list); + break; + case TSMAP_CASE: + TSMapGetDictionariesInternal(config->value.objectCase->command, list); + TSMapGetDictionariesInternal(config->value.objectCase->condition, list); + if (config->value.objectCase->elsebranch != NULL) + TSMapGetDictionariesInternal(config->value.objectCase->elsebranch, list); + break; + case TSMAP_DICTIONARY: + OidListAdd(list, config->value.objectDictionary); + break; + } +} + +/* + * Get OIDs of all dictionaries used in TSMapElement + */ +Oid * +TSMapGetDictionaries(TSMapElement *config) +{ + Oid *result; + OidList *list = OidListInit(); + + TSMapGetDictionariesInternal(config, list); + + result = list->data; + pfree(list); + + return result; +} + +/* + * Replace one dictionary OID with another in all instances inside a configuration + */ +void +TSMapReplaceDictionary(TSMapElement *config, Oid oldDict, Oid newDict) +{ + switch (config->type) + { + case TSMAP_EXPRESSION: + TSMapReplaceDictionary(config->value.objectExpression->left, oldDict, newDict); + TSMapReplaceDictionary(config->value.objectExpression->right, oldDict, newDict); + break; + case TSMAP_CASE: + TSMapReplaceDictionary(config->value.objectCase->command, oldDict, newDict); + TSMapReplaceDictionary(config->value.objectCase->condition, oldDict, newDict); + if (config->value.objectCase->elsebranch != NULL) + TSMapReplaceDictionary(config->value.objectCase->elsebranch, oldDict, newDict); + break; + case TSMAP_DICTIONARY: + if (config->value.objectDictionary == oldDict) + config->value.objectDictionary = newDict; + break; + } +} + +/* ---------------- + * Text Search Configuration Map Memory Management + * ---------------- + */ + +/* + * Move a FTS configuration expression to another memory context + */ +static TSMapElement * +TSMapExpressionMoveToMemoryContext(TSMapExpression *expression, MemoryContext context) +{ + TSMapElement *result = MemoryContextAlloc(context, sizeof(TSMapElement)); + TSMapExpression *resultExpression = MemoryContextAlloc(context, sizeof(TSMapExpression)); + + memset(resultExpression, 0, sizeof(TSMapExpression)); + result->value.objectExpression = resultExpression; + result->type = TSMAP_EXPRESSION; + + resultExpression->operator = expression->operator; + + resultExpression->left = TSMapMoveToMemoryContext(expression->left, context); + resultExpression->left->parent = result; + + resultExpression->right = TSMapMoveToMemoryContext(expression->right, context); + resultExpression->right->parent = result; + + return result; +} + +/* + * Move a FTS configuration case to another memory context + */ +static TSMapElement * +TSMapCaseMoveToMemoryContext(TSMapCase *caseObject, MemoryContext context) +{ + TSMapElement *result = MemoryContextAlloc(context, sizeof(TSMapElement)); + TSMapCase *resultCaseObject = MemoryContextAlloc(context, sizeof(TSMapCase)); + + memset(resultCaseObject, 0, sizeof(TSMapCase)); + result->value.objectCase = resultCaseObject; + result->type = TSMAP_CASE; + + resultCaseObject->match = caseObject->match; + + resultCaseObject->command = TSMapMoveToMemoryContext(caseObject->command, context); + resultCaseObject->command->parent = result; + + resultCaseObject->condition = TSMapMoveToMemoryContext(caseObject->condition, context); + resultCaseObject->condition->parent = result; + 
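+    /*
+     * Unlike command and condition, the elsebranch part of a CASE is
+     * optional, so it is deep-copied only when present.
+     */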
+ if (caseObject->elsebranch != NULL) + { + resultCaseObject->elsebranch = TSMapMoveToMemoryContext(caseObject->elsebranch, context); + resultCaseObject->elsebranch->parent = result; + } + + return result; +} + +/* + * Move a FTS configuration to another memory context + */ +TSMapElement * +TSMapMoveToMemoryContext(TSMapElement *config, MemoryContext context) +{ + TSMapElement *result = NULL; + + switch (config->type) + { + case TSMAP_EXPRESSION: + result = TSMapExpressionMoveToMemoryContext(config->value.objectExpression, context); + break; + case TSMAP_CASE: + result = TSMapCaseMoveToMemoryContext(config->value.objectCase, context); + break; + case TSMAP_DICTIONARY: + result = MemoryContextAlloc(context, sizeof(TSMapElement)); + result->type = TSMAP_DICTIONARY; + result->value.objectDictionary = config->value.objectDictionary; + break; + case TSMAP_KEEP: + result = MemoryContextAlloc(context, sizeof(TSMapElement)); + result->type = TSMAP_KEEP; + result->value.object = NULL; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("text search configuration is invalid"), + errdetail("Text search configuration contains object with invalid type."))); + break; + } + + return result; +} + +/* + * Free memory occupied by FTS configuration expression + */ +static void +TSMapExpressionFree(TSMapExpression *expression) +{ + if (expression->left) + TSMapElementFree(expression->left); + if (expression->right) + TSMapElementFree(expression->right); + pfree(expression); +} + +/* + * Free memory occupied by FTS configuration case + */ +static void +TSMapCaseFree(TSMapCase *caseObject) +{ + TSMapElementFree(caseObject->condition); + TSMapElementFree(caseObject->command); + TSMapElementFree(caseObject->elsebranch); + pfree(caseObject); +} + +/* + * Free memory occupied by FTS configuration element + */ +void +TSMapElementFree(TSMapElement *element) +{ + if (element != NULL) + { + switch (element->type) + { + case TSMAP_CASE: + TSMapCaseFree(element->value.objectCase); + break; + case TSMAP_EXPRESSION: + TSMapExpressionFree(element->value.objectExpression); + break; + } + pfree(element); + } +} + +/* + * Do a deep comparison of two TSMapElements. 
Doesn't check parents of elements + */ +bool +TSMapElementEquals(TSMapElement *a, TSMapElement *b) +{ + bool result = true; + + if (a->type == b->type) + { + switch (a->type) + { + case TSMAP_CASE: + if (!TSMapElementEquals(a->value.objectCase->condition, b->value.objectCase->condition)) + result = false; + if (!TSMapElementEquals(a->value.objectCase->command, b->value.objectCase->command)) + result = false; + + if (a->value.objectCase->elsebranch != NULL && b->value.objectCase->elsebranch != NULL) + { + if (!TSMapElementEquals(a->value.objectCase->elsebranch, b->value.objectCase->elsebranch)) + result = false; + } + else if (a->value.objectCase->elsebranch != NULL || b->value.objectCase->elsebranch != NULL) + result = false; + + if (a->value.objectCase->match != b->value.objectCase->match) + result = false; + break; + case TSMAP_EXPRESSION: + if (!TSMapElementEquals(a->value.objectExpression->left, b->value.objectExpression->left)) + result = false; + if (!TSMapElementEquals(a->value.objectExpression->right, b->value.objectExpression->right)) + result = false; + if (a->value.objectExpression->operator != b->value.objectExpression->operator) + result = false; + break; + case TSMAP_DICTIONARY: + result = a->value.objectDictionary == b->value.objectDictionary; + break; + case TSMAP_KEEP: + result = true; + } + } + else + result = false; + + return result; +} diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c index 7b69ef5660..f476abb323 100644 --- a/src/backend/tsearch/ts_parse.c +++ b/src/backend/tsearch/ts_parse.c @@ -16,58 +16,157 @@ #include "tsearch/ts_cache.h" #include "tsearch/ts_utils.h" +#include "tsearch/ts_configmap.h" +#include "utils/builtins.h" +#include "funcapi.h" #define IGNORE_LONGLEXEME 1 -/* +/*------------------- * Lexize subsystem + *------------------- */ +/* + * Representation of token produced by FTS parser. It contains intermediate + * lexemes in case of phrase dictionary processing. + */ typedef struct ParsedLex { - int type; - char *lemm; - int lenlemm; - struct ParsedLex *next; + int type; /* Token type */ + char *lemm; /* Token itself */ + int lenlemm; /* Length of the token string */ + int maplen; /* Length of the map */ + bool *accepted; /* Is accepted by some dictionary */ + bool *rejected; /* Is rejected by all dictionaries */ + bool *notFinished; /* Some dictionary not finished processing and + * waits for more tokens */ + struct ParsedLex *next; /* Next token in the list */ + TSMapElement *relatedRule; /* Rule which is used to produce lexemes from + * the token */ } ParsedLex; +/* + * List of tokens produced by FTS parser. 
+ */
 typedef struct ListParsedLex
 {
     ParsedLex  *head;
     ParsedLex  *tail;
 } ListParsedLex;
 
-typedef struct
+/*
+ * Dictionary state shared between processing of different tokens
+ */
+typedef struct DictState
 {
-    TSConfigCacheEntry *cfg;
-    Oid         curDictId;
-    int         posDict;
-    DictSubState dictState;
-    ParsedLex  *curSub;
-    ListParsedLex towork;       /* current list to work */
-    ListParsedLex waste;        /* list of lexemes that already lexized */
+    Oid         relatedDictionary;  /* DictState contains the state of the
+                                     * dictionary with this Oid */
+    DictSubState subState;      /* Internal state of the dictionary, used to
+                                 * keep state between dictionary calls */
+    ListParsedLex acceptedTokens;   /* Tokens which are processed and
+                                     * accepted, used in the last result
+                                     * returned by the dictionary */
+    ListParsedLex intermediateTokens;   /* Tokens which are not accepted, but
+                                         * were processed by a thesaurus-like
+                                         * dictionary */
+    bool        storeToAccepted;    /* Should the current token be appended
+                                     * to accepted or intermediate tokens */
+    bool        processed;      /* Whether the dictionary took control during
+                                 * processing of the current token */
+    TSLexeme   *tmpResult;      /* Last result returned by a thesaurus-like
+                                 * dictionary, if the dictionary is still
+                                 * waiting for more lexemes */
+} DictState;
 
-    /*
-     * fields to store last variant to lexize (basically, thesaurus or similar
-     * to, which wants several lexemes
-     */
+/*
+ * List of dictionary states
+ */
+typedef struct DictStateList
+{
+    int         listLength;
+    DictState  *states;
+} DictStateList;
 
-    ParsedLex  *lastRes;
-    TSLexeme   *tmpRes;
+/*
+ * Buffer entry with lexemes produced from the current token
+ */
+typedef struct LexemesBufferEntry
+{
+    TSMapElement *key;          /* Element of the mapping configuration that
+                                 * produced the entry */
+    ParsedLex  *token;          /* Token used for production of the lexemes */
+    TSLexeme   *data;           /* Lexemes produced from the current token */
+} LexemesBufferEntry;
+
+/*
+ * Buffer with lexemes produced from the current token
+ */
+typedef struct LexemesBuffer
+{
+    int         size;
+    LexemesBufferEntry *data;
+} LexemesBuffer;
+
+/*
+ * Storage for accepted and possibly-accepted lexemes
+ */
+typedef struct ResultStorage
+{
+    TSLexeme   *lexemes;        /* Processed lexemes, which are not yet
+                                 * accepted */
+    TSLexeme   *accepted;       /* Already accepted lexemes */
+} ResultStorage;
+
+/*
+ * FTS processing context
+ */
+typedef struct LexizeData
+{
+    TSConfigCacheEntry *cfg;    /* Text search configuration mappings for the
+                                 * current configuration */
+    DictStateList dslist;       /* List of all currently stored dictionary
+                                 * states */
+    ListParsedLex towork;       /* Current list to work on */
+    ListParsedLex waste;        /* List of lexemes that were already lexized */
+    LexemesBuffer buffer;       /* Buffer of processed lexemes. Used to avoid
+                                 * multiple executions of the token lexize
+                                 * process with the same parameters */
+    ResultStorage delayedResults;   /* Results that should be returned but
+                                     * may be rejected in the future */
+    Oid         skipDictionary; /* The dictionary we should skip during
+                                 * processing. Used to avoid an infinite loop
+                                 * in configurations with a phrase dictionary */
+    bool        debugContext;   /* If true, the relatedRule attribute is
+                                 * filled */
 } LexizeData;
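The towork/waste pair in LexizeData is a plain FIFO hand-off: the parser appends tokens to towork, and each token is moved to waste once lexized, so callers can still see which tokens produced the returned lexemes. A minimal self-contained sketch of that discipline (illustrative Node/Queue names, standard malloc instead of palloc; not part of the patch):

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative stand-ins for ParsedLex/ListParsedLex */
    typedef struct Node
    {
        const char *lemm;
        struct Node *next;
    } Node;

    typedef struct Queue
    {
        Node       *head;
        Node       *tail;
    } Queue;

    /* Same behavior as LPLAddTail: append at the tail */
    static void
    queue_add_tail(Queue *q, Node *n)
    {
        n->next = NULL;
        if (q->tail)
        {
            q->tail->next = n;
            q->tail = n;
        }
        else
            q->head = q->tail = n;
    }

    /* Same behavior as LPLRemoveHead: detach and return the head */
    static Node *
    queue_remove_head(Queue *q)
    {
        Node       *res = q->head;

        if (q->head)
            q->head = q->head->next;
        if (q->head == NULL)
            q->tail = NULL;
        return res;
    }

    int
    main(void)
    {
        Queue       towork = {NULL, NULL};
        Queue       waste = {NULL, NULL};
        const char *words[] = {"fat", "cats", "rats"};
        Node       *n;
        int         i;

        for (i = 0; i < 3; i++)
        {
            n = malloc(sizeof(Node));
            n->lemm = words[i];
            queue_add_tail(&towork, n);
        }

        /* RemoveHead() in the patch performs exactly this move */
        while (towork.head)
            queue_add_tail(&waste, queue_remove_head(&towork));

        for (n = waste.head; n; n = n->next)
            printf("%s\n", n->lemm);
        return 0;
    }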
-static void
-LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
+/*
+ * FTS processing debug context. Used during ts_debug() calls.
+ */
+typedef struct TSDebugContext
 {
-    ld->cfg = cfg;
-    ld->curDictId = InvalidOid;
-    ld->posDict = 0;
-    ld->towork.head = ld->towork.tail = ld->curSub = NULL;
-    ld->waste.head = ld->waste.tail = NULL;
-    ld->lastRes = NULL;
-    ld->tmpRes = NULL;
-}
+    TSConfigCacheEntry *cfg;    /* Text search configuration mappings for the
+                                 * current configuration */
+    TSParserCacheEntry *prsobj; /* Parser context of the current ts_debug
+                                 * context */
+    LexDescr   *tokenTypes;     /* Token types supported by the current
+                                 * parser */
+    void       *prsdata;        /* Parser data of the current ts_debug
+                                 * context */
+    LexizeData  ldata;          /* Lexize data of the current ts_debug
+                                 * context */
+    int         tokentype;      /* Token type of the last token */
+    TSLexeme   *savedLexemes;   /* Lexemes of the last token, stored for
+                                 * ts_debug output */
+    ParsedLex  *leftTokens;     /* Corresponding ParsedLex */
+} TSDebugContext;
+
+static TSLexeme *TSLexemeMap(LexizeData *ld, ParsedLex *token, TSMapExpression *expression);
+static TSLexeme *LexizeExecTSElement(LexizeData *ld, ParsedLex *token, TSMapElement *config);
+
+/*-------------------
+ * ListParsedLex API
+ *-------------------
+ */
+/*
+ * Add a ParsedLex to the end of the list
+ */
 static void
 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
 {
@@ -81,274 +180,1291 @@ LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
     newpl->next = NULL;
 }
 
-static ParsedLex *
-LPLRemoveHead(ListParsedLex *list)
-{
-    ParsedLex  *res = list->head;
+/*
+ * Add a copy of a ParsedLex to the end of the list
+ */
+static void
+LPLAddTailCopy(ListParsedLex *list, ParsedLex *newpl)
+{
+    ParsedLex  *copy = palloc0(sizeof(ParsedLex));
+
+    copy->lenlemm = newpl->lenlemm;
+    copy->type = newpl->type;
+    copy->lemm = newpl->lemm;
+    copy->relatedRule = newpl->relatedRule;
+    copy->next = NULL;
+
+    if (list->tail)
+    {
+        list->tail->next = copy;
+        list->tail = copy;
+    }
+    else
+        list->head = list->tail = copy;
+}
+
+/*
+ * Remove the head of the list.
Return pointer to detached head + */ +static ParsedLex * +LPLRemoveHead(ListParsedLex *list) +{ + ParsedLex *res = list->head; + + if (list->head) + list->head = list->head->next; + + if (list->head == NULL) + list->tail = NULL; + + return res; +} + +/* + * Remove all ParsedLex from the list + */ +static void +LPLClear(ListParsedLex *list) +{ + ParsedLex *tmp, + *ptr = list->head; + + while (ptr) + { + tmp = ptr->next; + pfree(ptr); + ptr = tmp; + } + + list->head = list->tail = NULL; +} + +/*------------------- + * LexizeData manipulation functions + *------------------- + */ + +/* + * Initialize empty LexizeData object + */ +static void +LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg) +{ + ld->cfg = cfg; + ld->skipDictionary = InvalidOid; + ld->towork.head = ld->towork.tail = NULL; + ld->waste.head = ld->waste.tail = NULL; + ld->dslist.listLength = 0; + ld->dslist.states = NULL; + ld->buffer.size = 0; + ld->buffer.data = NULL; + ld->delayedResults.lexemes = NULL; + ld->delayedResults.accepted = NULL; +} + +/* + * Add a token to the processing queue + */ +static void +LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) +{ + ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex)); + + newpl->type = type; + newpl->lemm = lemm; + newpl->lenlemm = lenlemm; + newpl->relatedRule = NULL; + LPLAddTail(&ld->towork, newpl); +} + +/* + * Remove head of the processing queue + */ +static void +RemoveHead(LexizeData *ld) +{ + LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork)); +} + +/* + * Set token corresponded to current lexeme + */ +static void +setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) +{ + if (correspondLexem) + *correspondLexem = ld->waste.head; + else + LPLClear(&ld->waste); + + ld->waste.head = ld->waste.tail = NULL; +} + +/*------------------- + * DictState manipulation functions + *------------------- + */ + +/* + * Get a state of dictionary based on its OID + */ +static DictState * +DictStateListGet(DictStateList *list, Oid dictId) +{ + int i; + DictState *result = NULL; + + for (i = 0; i < list->listLength; i++) + if (list->states[i].relatedDictionary == dictId) + result = &list->states[i]; + + return result; +} + +/* + * Remove a state of dictionary based on its OID + */ +static void +DictStateListRemove(DictStateList *list, Oid dictId) +{ + int i; + + for (i = 0; i < list->listLength; i++) + if (list->states[i].relatedDictionary == dictId) + break; + + if (i != list->listLength) + { + memcpy(list->states + i, list->states + i + 1, sizeof(DictState) * (list->listLength - i - 1)); + list->listLength--; + if (list->listLength == 0) + list->states = NULL; + else + list->states = repalloc(list->states, sizeof(DictState) * list->listLength); + } +} + +/* + * Insert a state of dictionary with specified OID + */ +static DictState * +DictStateListAdd(DictStateList *list, DictState *state) +{ + DictStateListRemove(list, state->relatedDictionary); + + list->listLength++; + if (list->states) + list->states = repalloc(list->states, sizeof(DictState) * list->listLength); + else + list->states = palloc0(sizeof(DictState) * list->listLength); + + memcpy(list->states + list->listLength - 1, state, sizeof(DictState)); + + return list->states + list->listLength - 1; +} + +/* + * Remove states of all dictionaries + */ +static void +DictStateListClear(DictStateList *list) +{ + list->listLength = 0; + if (list->states) + pfree(list->states); + list->states = NULL; +} + +/*------------------- + * LexemesBuffer manipulation functions + *------------------- + */ + +/* + * Check 
if there is a saved lexeme generated by the specified TSMapElement
+ */
+static bool
+LexemesBufferContains(LexemesBuffer *buffer, TSMapElement *key, ParsedLex *token)
+{
+    int         i;
+
+    for (i = 0; i < buffer->size; i++)
+        if (TSMapElementEquals(buffer->data[i].key, key) && buffer->data[i].token == token)
+            return true;
+
+    return false;
+}
+
+/*
+ * Get a saved lexeme generated by the specified TSMapElement
+ */
+static TSLexeme *
+LexemesBufferGet(LexemesBuffer *buffer, TSMapElement *key, ParsedLex *token)
+{
+    int         i;
+    TSLexeme   *result = NULL;
+
+    for (i = 0; i < buffer->size; i++)
+        if (TSMapElementEquals(buffer->data[i].key, key) && buffer->data[i].token == token)
+            result = buffer->data[i].data;
+
+    return result;
+}
+
+/*
+ * Remove a saved lexeme generated by the specified TSMapElement
+ */
+static void
+LexemesBufferRemove(LexemesBuffer *buffer, TSMapElement *key, ParsedLex *token)
+{
+    int         i;
+
+    for (i = 0; i < buffer->size; i++)
+        if (TSMapElementEquals(buffer->data[i].key, key) && buffer->data[i].token == token)
+            break;
+
+    if (i != buffer->size)
+    {
+        memcpy(buffer->data + i, buffer->data + i + 1, sizeof(LexemesBufferEntry) * (buffer->size - i - 1));
+        buffer->size--;
+        if (buffer->size == 0)
+            buffer->data = NULL;
+        else
+            buffer->data = repalloc(buffer->data, sizeof(LexemesBufferEntry) * buffer->size);
+    }
+}
+
+/*
+ * Save a lexeme generated by the specified TSMapElement
+ */
+static void
+LexemesBufferAdd(LexemesBuffer *buffer, TSMapElement *key, ParsedLex *token, TSLexeme *data)
+{
+    LexemesBufferRemove(buffer, key, token);
+
+    buffer->size++;
+    if (buffer->data)
+        buffer->data = repalloc(buffer->data, sizeof(LexemesBufferEntry) * buffer->size);
+    else
+        buffer->data = palloc0(sizeof(LexemesBufferEntry) * buffer->size);
+
+    buffer->data[buffer->size - 1].token = token;
+    buffer->data[buffer->size - 1].key = key;
+    buffer->data[buffer->size - 1].data = data;
+}
+
+/*
+ * Remove all lexemes saved in a buffer
+ */
+static void
+LexemesBufferClear(LexemesBuffer *buffer)
+{
+    int         i;
+    bool       *skipEntry = palloc0(sizeof(bool) * buffer->size);
+
+    for (i = 0; i < buffer->size; i++)
+    {
+        if (buffer->data[i].data != NULL && !skipEntry[i])
+        {
+            int         j;
+
+            for (j = 0; j < buffer->size; j++)
+                if (buffer->data[i].data == buffer->data[j].data)
+                    skipEntry[j] = true;
+
+            pfree(buffer->data[i].data);
+        }
+    }
+
+    buffer->size = 0;
+    if (buffer->data)
+        pfree(buffer->data);
+    buffer->data = NULL;
+}
+
+/*-------------------
+ * TSLexeme util functions
+ *-------------------
+ */
+
+/*
+ * Get the size of a TSLexeme array, not counting the terminating empty lexeme
+ */
+static int
+TSLexemeGetSize(TSLexeme *lex)
+{
+    int         result = 0;
+    TSLexeme   *ptr = lex;
+
+    while (ptr && ptr->lexeme)
+    {
+        result++;
+        ptr++;
+    }
+
+    return result;
+}
+
+/*
+ * Remove repeated lexemes. Also remove copies of whole nvariant groups.
+ */ +static TSLexeme * +TSLexemeRemoveDuplications(TSLexeme *lexeme) +{ + TSLexeme *res; + int curLexIndex; + int i; + int lexemeSize = TSLexemeGetSize(lexeme); + int shouldCopyCount = lexemeSize; + bool *shouldCopy; + + if (lexeme == NULL) + return NULL; + + shouldCopy = palloc(sizeof(bool) * lexemeSize); + memset(shouldCopy, true, sizeof(bool) * lexemeSize); + + for (curLexIndex = 0; curLexIndex < lexemeSize; curLexIndex++) + { + for (i = curLexIndex + 1; i < lexemeSize; i++) + { + if (!shouldCopy[i]) + continue; + + if (strcmp(lexeme[curLexIndex].lexeme, lexeme[i].lexeme) == 0) + { + if (lexeme[curLexIndex].nvariant == lexeme[i].nvariant) + { + shouldCopy[i] = false; + shouldCopyCount--; + continue; + } + else + { + /* + * Check for same set of lexemes in another nvariant + * series + */ + int nvariantCountL = 0; + int nvariantCountR = 0; + int nvariantOverlap = 1; + int j; + + for (j = 0; j < lexemeSize; j++) + if (lexeme[curLexIndex].nvariant == lexeme[j].nvariant) + nvariantCountL++; + for (j = 0; j < lexemeSize; j++) + if (lexeme[i].nvariant == lexeme[j].nvariant) + nvariantCountR++; + + if (nvariantCountL != nvariantCountR) + continue; + + for (j = 1; j < nvariantCountR; j++) + { + if (strcmp(lexeme[curLexIndex + j].lexeme, lexeme[i + j].lexeme) == 0 + && lexeme[curLexIndex + j].nvariant == lexeme[i + j].nvariant) + nvariantOverlap++; + } + + if (nvariantOverlap != nvariantCountR) + continue; + + for (j = 0; j < nvariantCountR; j++) + shouldCopy[i + j] = false; + } + } + } + } + + res = palloc0(sizeof(TSLexeme) * (shouldCopyCount + 1)); + + for (i = 0, curLexIndex = 0; curLexIndex < lexemeSize; curLexIndex++) + { + if (shouldCopy[curLexIndex]) + { + memcpy(res + i, lexeme + curLexIndex, sizeof(TSLexeme)); + i++; + } + } + + pfree(shouldCopy); + return res; +} + +/* + * Combine two lexeme lists with respect to positions + */ +static TSLexeme * +TSLexemeMergePositions(TSLexeme *left, TSLexeme *right) +{ + TSLexeme *result = NULL; + + if (left != NULL || right != NULL) + { + int left_i = 0; + int right_i = 0; + int left_max_nvariant = 0; + int i; + int left_size = TSLexemeGetSize(left); + int right_size = TSLexemeGetSize(right); + + result = palloc0(sizeof(TSLexeme) * (left_size + right_size + 1)); + + for (i = 0; i < left_size; i++) + if (left[i].nvariant > left_max_nvariant) + left_max_nvariant = left[i].nvariant; + + for (i = 0; i < right_size; i++) + right[i].nvariant += left_max_nvariant; + if (right && right[0].flags & TSL_ADDPOS) + right[0].flags &= ~TSL_ADDPOS; + + i = 0; + while (i < left_size + right_size) + { + if (left_i < left_size) + { + do + { + result[i++] = left[left_i++]; + } while (left && left[left_i].lexeme && (left[left_i].flags & TSL_ADDPOS) == 0); + } + + if (right_i < right_size) + { + do + { + result[i++] = right[right_i++]; + } while (right && right[right_i].lexeme && (right[right_i].flags & TSL_ADDPOS) == 0); + } + } + } + return result; +} + +/* + * Split lexemes generated by regular dictionaries and multi-input dictionaries + * and combine them with respect to positions + */ +static TSLexeme * +TSLexemeFilterMulti(TSLexeme *lexemes) +{ + TSLexeme *result; + TSLexeme *ptr = lexemes; + int multi_lexemes = 0; + + while (ptr && ptr->lexeme) + { + if (ptr->flags & TSL_MULTI) + multi_lexemes++; + ptr++; + } + + if (multi_lexemes > 0) + { + TSLexeme *lexemes_multi = palloc0(sizeof(TSLexeme) * (multi_lexemes + 1)); + TSLexeme *lexemes_rest = palloc0(sizeof(TSLexeme) * (TSLexemeGetSize(lexemes) - multi_lexemes + 1)); + int rest_i = 0; + int multi_i = 0; + + ptr = 
lexemes; + while (ptr && ptr->lexeme) + { + if (ptr->flags & TSL_MULTI) + lexemes_multi[multi_i++] = *ptr; + else + lexemes_rest[rest_i++] = *ptr; + + ptr++; + } + result = TSLexemeMergePositions(lexemes_rest, lexemes_multi); + } + else + { + result = TSLexemeMergePositions(lexemes, NULL); + } + + return result; +} + +/* + * Mark lexemes as generated by multi-input (thesaurus-like) dictionary + */ +static void +TSLexemeMarkMulti(TSLexeme *lexemes) +{ + TSLexeme *ptr = lexemes; + + while (ptr && ptr->lexeme) + { + ptr->flags |= TSL_MULTI; + ptr++; + } +} + +/*------------------- + * Lexemes set operations + *------------------- + */ + +/* + * Combine left and right lexeme lists into one. + * If append is true, right lexemes added after last left lexeme with TSL_ADDPOS flag + */ +static TSLexeme * +TSLexemeUnionOpt(TSLexeme *left, TSLexeme *right, bool append) +{ + TSLexeme *result; + int left_size = TSLexemeGetSize(left); + int right_size = TSLexemeGetSize(right); + int left_max_nvariant = 0; + int i; + + if (left == NULL && right == NULL) + { + result = NULL; + } + else + { + result = palloc0(sizeof(TSLexeme) * (left_size + right_size + 1)); + + for (i = 0; i < left_size; i++) + if (left[i].nvariant > left_max_nvariant) + left_max_nvariant = left[i].nvariant; + + if (left_size > 0) + memcpy(result, left, sizeof(TSLexeme) * left_size); + if (right_size > 0) + memcpy(result + left_size, right, sizeof(TSLexeme) * right_size); + if (append && left_size > 0 && right_size > 0) + result[left_size].flags |= TSL_ADDPOS; + + for (i = left_size; i < left_size + right_size; i++) + result[i].nvariant += left_max_nvariant; + } + + return result; +} + +/* + * Combine left and right lexeme lists into one + */ +static TSLexeme * +TSLexemeUnion(TSLexeme *left, TSLexeme *right) +{ + return TSLexemeUnionOpt(left, right, false); +} + +/* + * Remove common lexemes and return only which is stored in left list + */ +static TSLexeme * +TSLexemeExcept(TSLexeme *left, TSLexeme *right) +{ + TSLexeme *result = NULL; + int i, + j, + k; + int left_size = TSLexemeGetSize(left); + int right_size = TSLexemeGetSize(right); + + result = palloc0(sizeof(TSLexeme) * (left_size + 1)); + + for (k = 0, i = 0; i < left_size; i++) + { + bool found = false; + + for (j = 0; j < right_size; j++) + if (strcmp(left[i].lexeme, right[j].lexeme) == 0) + found = true; + + if (!found) + result[k++] = left[i]; + } + + return result; +} + +/* + * Keep only common lexemes + */ +static TSLexeme * +TSLexemeIntersect(TSLexeme *left, TSLexeme *right) +{ + TSLexeme *result = NULL; + int i, + j, + k; + int left_size = TSLexemeGetSize(left); + int right_size = TSLexemeGetSize(right); + + result = palloc0(sizeof(TSLexeme) * (left_size + 1)); + + for (k = 0, i = 0; i < left_size; i++) + { + bool found = false; + + for (j = 0; j < right_size; j++) + if (strcmp(left[i].lexeme, right[j].lexeme) == 0) + found = true; + + if (found) + result[k++] = left[i]; + } + + return result; +} + +/*------------------- + * Result storage functions + *------------------- + */ + +/* + * Add a lexeme to the result storage + */ +static void +ResultStorageAdd(ResultStorage *storage, ParsedLex *token, TSLexeme *lexs) +{ + TSLexeme *oldLexs = storage->lexemes; + + storage->lexemes = TSLexemeUnionOpt(storage->lexemes, lexs, true); + if (oldLexs) + pfree(oldLexs); +} + +/* + * Move all saved lexemes to accepted list + */ +static void +ResultStorageMoveToAccepted(ResultStorage *storage) +{ + if (storage->accepted) + { + TSLexeme *prevAccepted = storage->accepted; + + 
storage->accepted = TSLexemeUnionOpt(storage->accepted, storage->lexemes, true);
+        if (prevAccepted)
+            pfree(prevAccepted);
+        if (storage->lexemes)
+            pfree(storage->lexemes);
+    }
+    else
+    {
+        storage->accepted = storage->lexemes;
+    }
+    storage->lexemes = NULL;
+}
+
+/*
+ * Remove all non-accepted lexemes
+ */
+static void
+ResultStorageClearLexemes(ResultStorage *storage)
+{
+    if (storage->lexemes)
+        pfree(storage->lexemes);
+    storage->lexemes = NULL;
+}
+
+/*
+ * Remove all accepted lexemes
+ */
+static void
+ResultStorageClearAccepted(ResultStorage *storage)
+{
+    if (storage->accepted)
+        pfree(storage->accepted);
+    storage->accepted = NULL;
+}
+
+/*-------------------
+ * Condition and command execution
+ *-------------------
+ */
+
+/*
+ * Process a token by the dictionary
+ */
+static TSLexeme *
+LexizeExecDictionary(LexizeData *ld, ParsedLex *token, TSMapElement *dictionary)
+{
+    TSLexeme   *res;
+    TSDictionaryCacheEntry *dict;
+    DictSubState subState;
+    Oid         dictId = dictionary->value.objectDictionary;
+
+    if (ld->skipDictionary == dictId)
+        return NULL;
+
+    if (LexemesBufferContains(&ld->buffer, dictionary, token))
+        res = LexemesBufferGet(&ld->buffer, dictionary, token);
+    else
+    {
+        char       *curValLemm = token->lemm;
+        int         curValLenLemm = token->lenlemm;
+        DictState  *state = DictStateListGet(&ld->dslist, dictId);
+
+        dict = lookup_ts_dictionary_cache(dictId);
+
+        if (state)
+        {
+            subState = state->subState;
+            state->processed = true;
+        }
+        else
+        {
+            subState.isend = subState.getnext = false;
+            subState.private_state = NULL;
+        }
+
+        res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+                                                         PointerGetDatum(dict->dictData),
+                                                         PointerGetDatum(curValLemm),
+                                                         Int32GetDatum(curValLenLemm),
+                                                         PointerGetDatum(&subState)
+                                                         ));
+
+        if (subState.getnext)
+        {
+            /*
+             * Dictionary wants the next word, so store the current context
+             * and state in the DictStateList
+             */
+            if (state == NULL)
+            {
+                state = palloc0(sizeof(DictState));
+                state->processed = true;
+                state->relatedDictionary = dictId;
+                state->intermediateTokens.head = state->intermediateTokens.tail = NULL;
+                state->acceptedTokens.head = state->acceptedTokens.tail = NULL;
+                state->tmpResult = NULL;
+
+                /*
+                 * Add the state to the list and update the pointer in order
+                 * to work with the copy stored in the list
+                 */
+                state = DictStateListAdd(&ld->dslist, state);
+            }
+
+            state->subState = subState;
+            state->storeToAccepted = res != NULL;
+
+            if (res)
+            {
+                if (state->intermediateTokens.head != NULL)
+                {
+                    ParsedLex  *ptr = state->intermediateTokens.head;
+
+                    while (ptr)
+                    {
+                        LPLAddTailCopy(&state->acceptedTokens, ptr);
+                        ptr = ptr->next;
+                    }
+                    state->intermediateTokens.head = state->intermediateTokens.tail = NULL;
+                }
+
+                if (state->tmpResult)
+                    pfree(state->tmpResult);
+                TSLexemeMarkMulti(res);
+                state->tmpResult = res;
+                res = NULL;
+            }
+        }
+        else if (state != NULL)
+        {
+            if (res)
+            {
+                TSLexemeMarkMulti(res);
+                DictStateListRemove(&ld->dslist, dictId);
+            }
+            else
+            {
+                /*
+                 * Trigger post-processing in order to check tmpResult and
+                 * restart processing (see LexizeExec function)
+                 */
+                state->processed = false;
+            }
+        }
+        LexemesBufferAdd(&ld->buffer, dictionary, token, res);
+    }
+
+    return res;
+}
+
+/*
+ * Check whether the dictionary is waiting for more tokens
+ */
+static bool
+LexizeExecDictionaryWaitNext(LexizeData *ld, Oid dictId)
+{
+    DictState  *state = DictStateListGet(&ld->dslist, dictId);
+
+    if (state)
+        return state->subState.getnext;
+    else
+        return false;
+}
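Before LexizeExecIsNull below, it may help to see the CASE branch-selection rule in isolation. A tiny standalone sketch (hypothetical helper name, not part of the patch) of the predicate LexizeExecTSElement applies: the WHEN branch fires when the condition produced lexemes and MATCH was requested, or produced none and NO MATCH was requested; otherwise the ELSE branch (if any) runs.

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Branch selection used by CASE: "match" is true for WHEN MATCH and
     * false for WHEN NO MATCH; "conditionIsNull" is true when the condition
     * produced no lexemes (and no dictionary is still waiting for input).
     */
    static bool
    case_branch_fires(bool conditionIsNull, bool match)
    {
        return (!conditionIsNull && match) || (conditionIsNull && !match);
    }

    int
    main(void)
    {
        printf("MATCH,    condition returned lexemes -> %d\n", case_branch_fires(false, true));
        printf("MATCH,    condition returned NULL    -> %d\n", case_branch_fires(true, true));
        printf("NO MATCH, condition returned lexemes -> %d\n", case_branch_fires(false, false));
        printf("NO MATCH, condition returned NULL    -> %d\n", case_branch_fires(true, false));
        return 0;
    }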
+/*
+ * Check whether the dictionary result for the current token is NULL.
+ * If the dictionary is waiting for more lexemes, the result is treated as
+ * non-NULL.
+ */
+static bool
+LexizeExecIsNull(LexizeData *ld, ParsedLex *token, TSMapElement *config)
+{
+    bool        result = false;
+
+    if (config->type == TSMAP_EXPRESSION)
+    {
+        TSMapExpression *expression = config->value.objectExpression;
+
+        result = LexizeExecIsNull(ld, token, expression->left) || LexizeExecIsNull(ld, token, expression->right);
+    }
+    else if (config->type == TSMAP_DICTIONARY)
+    {
+        Oid         dictOid = config->value.objectDictionary;
+        TSLexeme   *lexemes = LexizeExecDictionary(ld, token, config);
+
+        if (lexemes)
+            result = false;
+        else
+            result = !LexizeExecDictionaryWaitNext(ld, dictOid);
+    }
+    return result;
+}
+
+/*
+ * Execute a MAP operator
+ */
+static TSLexeme *
+TSLexemeMap(LexizeData *ld, ParsedLex *token, TSMapExpression *expression)
+{
+    TSLexeme   *left_res;
+    TSLexeme   *result = NULL;
+    int         left_size;
+    int         i;
+
+    left_res = LexizeExecTSElement(ld, token, expression->left);
+    left_size = TSLexemeGetSize(left_res);
+
+    if (left_res == NULL && LexizeExecIsNull(ld, token, expression->left))
+        result = LexizeExecTSElement(ld, token, expression->right);
+    else if (expression->operator == TSMAP_OP_COMMA &&
+             ((left_res != NULL && (left_res->flags & TSL_FILTER) == 0) || left_res == NULL))
+        result = left_res;
+    else
+    {
+        TSMapElement *relatedRuleTmp = NULL;
+
+        relatedRuleTmp = palloc0(sizeof(TSMapElement));
+        relatedRuleTmp->parent = NULL;
+        relatedRuleTmp->type = TSMAP_EXPRESSION;
+        relatedRuleTmp->value.objectExpression = palloc0(sizeof(TSMapExpression));
+        relatedRuleTmp->value.objectExpression->operator = expression->operator;
+        relatedRuleTmp->value.objectExpression->left = token->relatedRule;
+
+        for (i = 0; i < left_size; i++)
+        {
+            TSLexeme   *tmp_res = NULL;
+            TSLexeme   *prev_res;
+            ParsedLex   tmp_token;
+
+            tmp_token.lemm = left_res[i].lexeme;
+            tmp_token.lenlemm = strlen(left_res[i].lexeme);
+            tmp_token.type = token->type;
+            tmp_token.next = NULL;
+
+            tmp_res = LexizeExecTSElement(ld, &tmp_token, expression->right);
+            relatedRuleTmp->value.objectExpression->right = tmp_token.relatedRule;
+            prev_res = result;
+            result = TSLexemeUnion(prev_res, tmp_res);
+            if (prev_res)
+                pfree(prev_res);
+        }
+        token->relatedRule = relatedRuleTmp;
+    }
+
+    return result;
+}
+
+/*
+ * Execute a TSMapElement.
+ * Common entry point for all possible types of TSMapElement
+ */
+static TSLexeme *
+LexizeExecTSElement(LexizeData *ld, ParsedLex *token, TSMapElement *config)
+{
+    TSLexeme   *result = NULL;
+
+    if (LexemesBufferContains(&ld->buffer, config, token))
+    {
+        if (ld->debugContext)
+            token->relatedRule = config;
+        result = LexemesBufferGet(&ld->buffer, config, token);
+    }
+    else if (config->type == TSMAP_DICTIONARY)
+    {
+        if (ld->debugContext)
+            token->relatedRule = config;
+        result = LexizeExecDictionary(ld, token, config);
+    }
+    else if (config->type == TSMAP_CASE)
+    {
+        TSMapCase  *caseObject = config->value.objectCase;
+        bool        conditionIsNull = LexizeExecIsNull(ld, token, caseObject->condition);
+
+        if ((!conditionIsNull && caseObject->match) || (conditionIsNull && !caseObject->match))
+        {
+            if (caseObject->command->type == TSMAP_KEEP)
+                result = LexizeExecTSElement(ld, token, caseObject->condition);
+            else
+                result = LexizeExecTSElement(ld, token, caseObject->command);
+        }
+        else if (caseObject->elsebranch)
+            result = LexizeExecTSElement(ld, token, caseObject->elsebranch);
+    }
+    else if (config->type == TSMAP_EXPRESSION)
+    {
+        TSLexeme   *resLeft = NULL;
+        TSLexeme   *resRight = NULL;
+        TSMapElement *relatedRuleTmp = NULL;
+
TSMapExpression *expression = config->value.objectExpression; + + if (expression->operator != TSMAP_OP_MAP && expression->operator != TSMAP_OP_COMMA) + { + if (ld->debugContext) + { + relatedRuleTmp = palloc0(sizeof(TSMapElement)); + relatedRuleTmp->parent = NULL; + relatedRuleTmp->type = TSMAP_EXPRESSION; + relatedRuleTmp->value.objectExpression = palloc0(sizeof(TSMapExpression)); + relatedRuleTmp->value.objectExpression->operator = expression->operator; + } - if (list->head) - list->head = list->head->next; + resLeft = LexizeExecTSElement(ld, token, expression->left); + if (ld->debugContext) + relatedRuleTmp->value.objectExpression->left = token->relatedRule; - if (list->head == NULL) - list->tail = NULL; + resRight = LexizeExecTSElement(ld, token, expression->right); + if (ld->debugContext) + relatedRuleTmp->value.objectExpression->right = token->relatedRule; + } - return res; -} + switch (expression->operator) + { + case TSMAP_OP_UNION: + result = TSLexemeUnion(resLeft, resRight); + break; + case TSMAP_OP_EXCEPT: + result = TSLexemeExcept(resLeft, resRight); + break; + case TSMAP_OP_INTERSECT: + result = TSLexemeIntersect(resLeft, resRight); + break; + case TSMAP_OP_MAP: + case TSMAP_OP_COMMA: + result = TSLexemeMap(ld, token, expression); + break; + default: + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("text search configuration is invalid"), + errdetail("Text search configuration contains invalid expression operator."))); + break; + } -static void -LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) -{ - ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex)); + if (ld->debugContext && relatedRuleTmp != NULL) + token->relatedRule = relatedRuleTmp; + } - newpl->type = type; - newpl->lemm = lemm; - newpl->lenlemm = lenlemm; - LPLAddTail(&ld->towork, newpl); - ld->curSub = ld->towork.tail; + if (!LexemesBufferContains(&ld->buffer, config, token)) + LexemesBufferAdd(&ld->buffer, config, token, result); + + return result; } -static void -RemoveHead(LexizeData *ld) +/*------------------- + * LexizeExec and helpers functions + *------------------- + */ + +/* + * Processing of EOF-like token. + * Return all temporary results if any are saved. 
+ */ +static TSLexeme * +LexizeExecFinishProcessing(LexizeData *ld) { - LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork)); + int i; + TSLexeme *res = NULL; + + for (i = 0; i < ld->dslist.listLength; i++) + { + TSLexeme *last_res = res; - ld->posDict = 0; + res = TSLexemeUnion(res, ld->dslist.states[i].tmpResult); + if (last_res) + pfree(last_res); + } + + return res; } -static void -setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) +/* + * Get last accepted result of the phrase-dictionary + */ +static TSLexeme * +LexizeExecGetPreviousResults(LexizeData *ld) { - if (correspondLexem) - { - *correspondLexem = ld->waste.head; - } - else - { - ParsedLex *tmp, - *ptr = ld->waste.head; + int i; + TSLexeme *res = NULL; - while (ptr) + for (i = 0; i < ld->dslist.listLength; i++) + { + if (!ld->dslist.states[i].processed) { - tmp = ptr->next; - pfree(ptr); - ptr = tmp; + TSLexeme *last_res = res; + + res = TSLexemeUnion(res, ld->dslist.states[i].tmpResult); + if (last_res) + pfree(last_res); } } - ld->waste.head = ld->waste.tail = NULL; + + return res; } +/* + * Remove all dictionary states which wasn't used for current token + */ static void -moveToWaste(LexizeData *ld, ParsedLex *stop) +LexizeExecClearDictStates(LexizeData *ld) { - bool go = true; + int i; - while (ld->towork.head && go) + for (i = 0; i < ld->dslist.listLength; i++) { - if (ld->towork.head == stop) + if (!ld->dslist.states[i].processed) { - ld->curSub = stop->next; - go = false; + DictStateListRemove(&ld->dslist, ld->dslist.states[i].relatedDictionary); + i = 0; } - RemoveHead(ld); } } -static void -setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) +/* + * Check if there are any dictionaries that didn't processed current token + */ +static bool +LexizeExecNotProcessedDictStates(LexizeData *ld) { - if (ld->tmpRes) - { - TSLexeme *ptr; + int i; - for (ptr = ld->tmpRes; ptr->lexeme; ptr++) - pfree(ptr->lexeme); - pfree(ld->tmpRes); - } - ld->tmpRes = res; - ld->lastRes = lex; + for (i = 0; i < ld->dslist.listLength; i++) + if (!ld->dslist.states[i].processed) + return true; + + return false; } +/* + * Do a lexize processing for a towork queue in LexizeData + */ static TSLexeme * LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) { + ParsedLex *token; + TSMapElement *config; + TSLexeme *res = NULL; + TSLexeme *prevIterationResult = NULL; + bool removeHead = false; + bool resetSkipDictionary = false; + bool accepted = false; int i; - ListDictionary *map; - TSDictionaryCacheEntry *dict; - TSLexeme *res; - if (ld->curDictId == InvalidOid) + for (i = 0; i < ld->dslist.listLength; i++) + ld->dslist.states[i].processed = false; + if (ld->skipDictionary != InvalidOid) + resetSkipDictionary = true; + + token = ld->towork.head; + if (token == NULL) { - /* - * usual mode: dictionary wants only one word, but we should keep in - * mind that we should go through all stack - */ + setCorrLex(ld, correspondLexem); + return NULL; + } - while (ld->towork.head) + if (token->type >= ld->cfg->lenmap) + { + removeHead = true; + } + else + { + config = ld->cfg->map[token->type]; + if (config != NULL) + { + res = LexizeExecTSElement(ld, token, config); + prevIterationResult = LexizeExecGetPreviousResults(ld); + removeHead = prevIterationResult == NULL; + } + else { - ParsedLex *curVal = ld->towork.head; - char *curValLemm = curVal->lemm; - int curValLenLemm = curVal->lenlemm; + removeHead = true; + if (token->type == 0) /* Processing EOF-like token */ + { + res = LexizeExecFinishProcessing(ld); + prevIterationResult = NULL; + } + } - 
map = ld->cfg->map + curVal->type; + if (LexizeExecNotProcessedDictStates(ld) && (token->type == 0 || config != NULL)) /* Rollback processing */ + { + int i; + ListParsedLex *intermediateTokens = NULL; + ListParsedLex *acceptedTokens = NULL; - if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0) + for (i = 0; i < ld->dslist.listLength; i++) { - /* skip this type of lexeme */ - RemoveHead(ld); - continue; + if (!ld->dslist.states[i].processed) + { + intermediateTokens = &ld->dslist.states[i].intermediateTokens; + acceptedTokens = &ld->dslist.states[i].acceptedTokens; + if (prevIterationResult == NULL) + ld->skipDictionary = ld->dslist.states[i].relatedDictionary; + } } - for (i = ld->posDict; i < map->len; i++) + if (intermediateTokens && intermediateTokens->head) { - dict = lookup_ts_dictionary_cache(map->dictIds[i]); - - ld->dictState.isend = ld->dictState.getnext = false; - ld->dictState.private_state = NULL; - res = (TSLexeme *) DatumGetPointer(FunctionCall4( - &(dict->lexize), - PointerGetDatum(dict->dictData), - PointerGetDatum(curValLemm), - Int32GetDatum(curValLenLemm), - PointerGetDatum(&ld->dictState) - )); - - if (ld->dictState.getnext) + ParsedLex *head = ld->towork.head; + + ld->towork.head = intermediateTokens->head; + intermediateTokens->tail->next = head; + head->next = NULL; + ld->towork.tail = head; + removeHead = false; + LPLClear(&ld->waste); + if (acceptedTokens && acceptedTokens->head) { - /* - * dictionary wants next word, so setup and store current - * position and go to multiword mode - */ - - ld->curDictId = DatumGetObjectId(map->dictIds[i]); - ld->posDict = i + 1; - ld->curSub = curVal->next; - if (res) - setNewTmpRes(ld, curVal, res); - return LexizeExec(ld, correspondLexem); + ld->waste.head = acceptedTokens->head; + ld->waste.tail = acceptedTokens->tail; } + } + ResultStorageClearLexemes(&ld->delayedResults); + if (config != NULL) + res = NULL; + } - if (!res) /* dictionary doesn't know this lexeme */ - continue; + if (config != NULL) + LexizeExecClearDictStates(ld); + else if (token->type == 0) + DictStateListClear(&ld->dslist); + } - if (res->flags & TSL_FILTER) - { - curValLemm = res->lexeme; - curValLenLemm = strlen(res->lexeme); - continue; - } + if (prevIterationResult) + res = prevIterationResult; + else + { + int i; - RemoveHead(ld); - setCorrLex(ld, correspondLexem); - return res; + for (i = 0; i < ld->dslist.listLength; i++) + { + if (ld->dslist.states[i].storeToAccepted) + { + LPLAddTailCopy(&ld->dslist.states[i].acceptedTokens, token); + accepted = true; + ld->dslist.states[i].storeToAccepted = false; + } + else + { + LPLAddTailCopy(&ld->dslist.states[i].intermediateTokens, token); } - - RemoveHead(ld); } } - else - { /* curDictId is valid */ - dict = lookup_ts_dictionary_cache(ld->curDictId); + if (removeHead) + RemoveHead(ld); + + if (ld->dslist.listLength > 0) + { /* - * Dictionary ld->curDictId asks us about following words + * There is at least one thesaurus dictionary in the middle of + * processing. Delay return of the result to avoid wrong lexemes in + * case of thesaurus phrase rejection. 
*/ + ResultStorageAdd(&ld->delayedResults, token, res); + if (accepted) + ResultStorageMoveToAccepted(&ld->delayedResults); - while (ld->curSub) + /* + * Current value of res should not be cleared, because it is stored in + * LexemesBuffer + */ + res = NULL; + } + else + { + if (ld->towork.head == NULL) { - ParsedLex *curVal = ld->curSub; - - map = ld->cfg->map + curVal->type; - - if (curVal->type != 0) - { - bool dictExists = false; - - if (curVal->type >= ld->cfg->lenmap || map->len == 0) - { - /* skip this type of lexeme */ - ld->curSub = curVal->next; - continue; - } + TSLexeme *oldAccepted = ld->delayedResults.accepted; - /* - * We should be sure that current type of lexeme is recognized - * by our dictionary: we just check is it exist in list of - * dictionaries ? - */ - for (i = 0; i < map->len && !dictExists; i++) - if (ld->curDictId == DatumGetObjectId(map->dictIds[i])) - dictExists = true; - - if (!dictExists) - { - /* - * Dictionary can't work with current tpe of lexeme, - * return to basic mode and redo all stored lexemes - */ - ld->curDictId = InvalidOid; - return LexizeExec(ld, correspondLexem); - } - } + ld->delayedResults.accepted = TSLexemeUnionOpt(ld->delayedResults.accepted, ld->delayedResults.lexemes, true); + if (oldAccepted) + pfree(oldAccepted); + } - ld->dictState.isend = (curVal->type == 0) ? true : false; - ld->dictState.getnext = false; + /* + * Add accepted delayed results to the output of the parsing. All + * lexemes returned during thesaurus phrase processing should be + * returned simultaneously, since all phrase tokens are processed as + * one. + */ + if (ld->delayedResults.accepted != NULL) + { + /* + * Previous value of res should not be cleared, because it is + * stored in LexemesBuffer + */ + res = TSLexemeUnionOpt(ld->delayedResults.accepted, res, prevIterationResult == NULL); - res = (TSLexeme *) DatumGetPointer(FunctionCall4( - &(dict->lexize), - PointerGetDatum(dict->dictData), - PointerGetDatum(curVal->lemm), - Int32GetDatum(curVal->lenlemm), - PointerGetDatum(&ld->dictState) - )); + ResultStorageClearLexemes(&ld->delayedResults); + ResultStorageClearAccepted(&ld->delayedResults); + } + setCorrLex(ld, correspondLexem); + } - if (ld->dictState.getnext) - { - /* Dictionary wants one more */ - ld->curSub = curVal->next; - if (res) - setNewTmpRes(ld, curVal, res); - continue; - } + if (resetSkipDictionary) + ld->skipDictionary = InvalidOid; - if (res || ld->tmpRes) - { - /* - * Dictionary normalizes lexemes, so we remove from stack all - * used lexemes, return to basic mode and redo end of stack - * (if it exists) - */ - if (res) - { - moveToWaste(ld, ld->curSub); - } - else - { - res = ld->tmpRes; - moveToWaste(ld, ld->lastRes); - } + res = TSLexemeFilterMulti(res); + if (res) + res = TSLexemeRemoveDuplications(res); - /* reset to initial state */ - ld->curDictId = InvalidOid; - ld->posDict = 0; - ld->lastRes = NULL; - ld->tmpRes = NULL; - setCorrLex(ld, correspondLexem); - return res; - } + /* + * Copy result since it may be stored in LexemesBuffere and removed at the + * next step. 
+ */ + if (res) + { + TSLexeme *oldRes = res; + int resSize = TSLexemeGetSize(res); - /* - * Dict don't want next lexem and didn't recognize anything, redo - * from ld->towork.head - */ - ld->curDictId = InvalidOid; - return LexizeExec(ld, correspondLexem); - } + res = palloc0(sizeof(TSLexeme) * (resSize + 1)); + memcpy(res, oldRes, sizeof(TSLexeme) * resSize); } - setCorrLex(ld, correspondLexem); - return NULL; + LexemesBufferClear(&ld->buffer); + return res; } +/*------------------- + * ts_parse API functions + *------------------- + */ + /* * Parse string and lexize words. * @@ -357,7 +1473,7 @@ LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) void parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen) { - int type, + int type = -1, lenlemm; char *lemm = NULL; LexizeData ldata; @@ -375,36 +1491,42 @@ parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen) LexizeInit(&ldata, cfg); + type = 1; do { - type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), - PointerGetDatum(prsdata), - PointerGetDatum(&lemm), - PointerGetDatum(&lenlemm))); - - if (type > 0 && lenlemm >= MAXSTRLEN) + if (type > 0) { + type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), + PointerGetDatum(prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + if (type > 0 && lenlemm >= MAXSTRLEN) + { #ifdef IGNORE_LONGLEXEME - ereport(NOTICE, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("word is too long to be indexed"), - errdetail("Words longer than %d characters are ignored.", - MAXSTRLEN))); - continue; + ereport(NOTICE, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); + continue; #else - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("word is too long to be indexed"), - errdetail("Words longer than %d characters are ignored.", - MAXSTRLEN))); + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); #endif - } + } - LexizeAddLemm(&ldata, type, lemm, lenlemm); + LexizeAddLemm(&ldata, type, lemm, lenlemm); + } while ((norms = LexizeExec(&ldata, NULL)) != NULL) { - TSLexeme *ptr = norms; + TSLexeme *ptr; + + ptr = norms; prs->pos++; /* set pos */ @@ -429,14 +1551,246 @@ parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen) } pfree(norms); } - } while (type > 0); + } while (type > 0 || ldata.towork.head); FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); } +/*------------------- + * ts_debug and helper functions + *------------------- + */ + +/* + * Free memory occupied by temporary TSMapElement + */ + +static void +ts_debug_free_rule(TSMapElement *element) +{ + if (element != NULL && element->type == TSMAP_EXPRESSION) + { + ts_debug_free_rule(element->value.objectExpression->left); + ts_debug_free_rule(element->value.objectExpression->right); + pfree(element->value.objectExpression); + pfree(element); + } +} + +/* + * Initialize SRF context and text parser for ts_debug execution. 
+ */ +static void +ts_debug_init(Oid cfgId, text *inputText, FunctionCallInfo fcinfo) +{ + TupleDesc tupdesc; + char *buf; + int buflen; + FuncCallContext *funcctx; + MemoryContext oldcontext; + TSDebugContext *context; + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + buf = text_to_cstring(inputText); + buflen = strlen(buf); + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + + funcctx->user_fctx = palloc0(sizeof(TSDebugContext)); + funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); + + context = funcctx->user_fctx; + context->cfg = lookup_ts_config_cache(cfgId); + context->prsobj = lookup_ts_parser_cache(context->cfg->prsId); + + context->tokenTypes = (LexDescr *) DatumGetPointer(OidFunctionCall1(context->prsobj->lextypeOid, + (Datum) 0)); + + context->prsdata = (void *) DatumGetPointer(FunctionCall2(&context->prsobj->prsstart, + PointerGetDatum(buf), + Int32GetDatum(buflen))); + LexizeInit(&context->ldata, context->cfg); + context->ldata.debugContext = true; + context->tokentype = 1; + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Get one token from input text and add it to processing queue. + */ +static void +ts_debug_get_token(FuncCallContext *funcctx) +{ + TSDebugContext *context; + MemoryContext oldcontext; + int lenlemm; + char *lemm = NULL; + + context = funcctx->user_fctx; + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + context->tokentype = DatumGetInt32(FunctionCall3(&(context->prsobj->prstoken), + PointerGetDatum(context->prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + if (context->tokentype > 0 && lenlemm >= MAXSTRLEN) + { +#ifdef IGNORE_LONGLEXEME + ereport(NOTICE, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); +#else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); +#endif + } + + LexizeAddLemm(&context->ldata, context->tokentype, lemm, lenlemm); + MemoryContextSwitchTo(oldcontext); +} + /* + * Parse text and print debug information, such as token type, dictionary map + * configuration, selected command and lexemes for each token. 
+ * Arguments: regconfiguration(Oid) cfgId, text *inputText
+ */
+Datum
+ts_debug(PG_FUNCTION_ARGS)
+{
+    FuncCallContext *funcctx;
+    TSDebugContext *context;
+    MemoryContext oldcontext;
+
+    if (SRF_IS_FIRSTCALL())
+    {
+        Oid         cfgId = PG_GETARG_OID(0);
+        text       *inputText = PG_GETARG_TEXT_P(1);
+
+        ts_debug_init(cfgId, inputText, fcinfo);
+    }
+
+    funcctx = SRF_PERCALL_SETUP();
+    context = funcctx->user_fctx;
+
+    while (context->tokentype > 0 && context->leftTokens == NULL)
+    {
+        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+        ts_debug_get_token(funcctx);
+
+        context->savedLexemes = LexizeExec(&context->ldata, &(context->leftTokens));
+
+        MemoryContextSwitchTo(oldcontext);
+    }
+
+    while (context->leftTokens == NULL && context->ldata.towork.head != NULL)
+        context->savedLexemes = LexizeExec(&context->ldata, &(context->leftTokens));
+
+    if (context->leftTokens && context->leftTokens->type > 0)
+    {
+        HeapTuple   tuple;
+        Datum       result;
+        char      **values;
+        ParsedLex  *lex = context->leftTokens;
+        StringInfo  str;
+        TSLexeme   *ptr;
+
+        values = palloc0(sizeof(char *) * 7);
+        str = makeStringInfo();
+
+        values[0] = context->tokenTypes[lex->type - 1].alias;
+        values[1] = context->tokenTypes[lex->type - 1].descr;
+
+        values[2] = palloc0(sizeof(char) * (lex->lenlemm + 1));
+        memcpy(values[2], lex->lemm, sizeof(char) * lex->lenlemm);
+
+        appendStringInfoChar(str, '{');
+        if (lex->type < context->ldata.cfg->lenmap && context->ldata.cfg->map[lex->type])
+        {
+            Oid        *dictionaries = TSMapGetDictionaries(context->ldata.cfg->map[lex->type]);
+            Oid        *currentDictionary;
+
+            for (currentDictionary = dictionaries; *currentDictionary != InvalidOid; currentDictionary++)
+            {
+                if (currentDictionary != dictionaries)
+                    appendStringInfoChar(str, ',');
+
+                TSMapPrintDictName(*currentDictionary, str);
+            }
+        }
+        appendStringInfoChar(str, '}');
+        values[3] = str->data;
+
+        if (lex->type < context->ldata.cfg->lenmap && context->ldata.cfg->map[lex->type])
+        {
+            initStringInfo(str);
+            TSMapPrintElement(context->ldata.cfg->map[lex->type], str);
+            values[4] = str->data;
+
+            initStringInfo(str);
+            if (lex->relatedRule)
+            {
+                TSMapPrintElement(lex->relatedRule, str);
+                values[5] = str->data;
+                str = makeStringInfo();
+                ts_debug_free_rule(lex->relatedRule);
+                lex->relatedRule = NULL;
+            }
+        }
+
+        initStringInfo(str);
+        ptr = context->savedLexemes;
+        if (context->savedLexemes)
+            appendStringInfoChar(str, '{');
+
+        while (ptr && ptr->lexeme)
+        {
+            if (ptr != context->savedLexemes)
+                appendStringInfoString(str, ", ");
+            appendStringInfoString(str, ptr->lexeme);
+            ptr++;
+        }
+        if (context->savedLexemes)
+        {
+            appendStringInfoChar(str, '}');
+            values[6] = str->data;
+        }
+        else
+            values[6] = NULL;
+
+        tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+        result = HeapTupleGetDatum(tuple);
+
+        context->leftTokens = lex->next;
+        pfree(lex);
+        if (context->leftTokens == NULL && context->savedLexemes)
+            pfree(context->savedLexemes);
+
+        SRF_RETURN_NEXT(funcctx, result);
+    }
+
+    FunctionCall1(&(context->prsobj->prsend), PointerGetDatum(context->prsdata));
+    SRF_RETURN_DONE(funcctx);
+}
+
-/*
+/*-------------------
  * Headline framework
+ *-------------------
  */
+
 static void
 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
 {
@@ -532,12 +1886,12 @@ addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme
 void
 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery 
query, char *buf, int buflen) { - int type, + int type = -1, lenlemm; char *lemm = NULL; LexizeData ldata; TSLexeme *norms; - ParsedLex *lexs; + ParsedLex *lexs = NULL; TSConfigCacheEntry *cfg; TSParserCacheEntry *prsobj; void *prsdata; @@ -551,32 +1905,36 @@ hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int bu LexizeInit(&ldata, cfg); + type = 1; do { - type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), - PointerGetDatum(prsdata), - PointerGetDatum(&lemm), - PointerGetDatum(&lenlemm))); - - if (type > 0 && lenlemm >= MAXSTRLEN) + if (type > 0) { + type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), + PointerGetDatum(prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + if (type > 0 && lenlemm >= MAXSTRLEN) + { #ifdef IGNORE_LONGLEXEME - ereport(NOTICE, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("word is too long to be indexed"), - errdetail("Words longer than %d characters are ignored.", - MAXSTRLEN))); - continue; + ereport(NOTICE, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); + continue; #else - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("word is too long to be indexed"), - errdetail("Words longer than %d characters are ignored.", - MAXSTRLEN))); + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("word is too long to be indexed"), + errdetail("Words longer than %d characters are ignored.", + MAXSTRLEN))); #endif - } + } - LexizeAddLemm(&ldata, type, lemm, lenlemm); + LexizeAddLemm(&ldata, type, lemm, lenlemm); + } do { @@ -587,9 +1945,10 @@ hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int bu } else addHLParsedLex(prs, query, lexs, NULL); + lexs = NULL; } while (norms); - } while (type > 0); + } while (type > 0 || ldata.towork.head); FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); } @@ -642,14 +2001,14 @@ generateHeadline(HeadlineParsedText *prs) } else if (!wrd->skip) { - if (wrd->selected) + if (wrd->selected && (wrd == prs->words || !(wrd - 1)->selected)) { memcpy(ptr, prs->startsel, prs->startsellen); ptr += prs->startsellen; } memcpy(ptr, wrd->word, wrd->len); ptr += wrd->len; - if (wrd->selected) + if (wrd->selected && ((wrd + 1 - prs->words) == prs->curwords || !(wrd + 1)->selected)) { memcpy(ptr, prs->stopsel, prs->stopsellen); ptr += prs->stopsellen; diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index f6e03aea4f..0dd846bece 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -20,7 +20,6 @@ #include "tsearch/ts_locale.h" #include "tsearch/ts_utils.h" - /* * Given the base name and extension of a tsearch config file, return * its full path name. 
The base name is assumed to be user-supplied,
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 2b381782a3..f251e83ff6 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -828,11 +828,10 @@ static const struct cachedesc cacheinfo[] = {
 	},
 	{TSConfigMapRelationId,		/* TSCONFIGMAP */
 		TSConfigMapIndexId,
-		3,
+		2,
 		{
 			Anum_pg_ts_config_map_mapcfg,
 			Anum_pg_ts_config_map_maptokentype,
-			Anum_pg_ts_config_map_mapseqno,
 			0
 		},
 		2
diff --git a/src/backend/utils/cache/ts_cache.c b/src/backend/utils/cache/ts_cache.c
index 97347780d3..1ff1a9255c 100644
--- a/src/backend/utils/cache/ts_cache.c
+++ b/src/backend/utils/cache/ts_cache.c
@@ -39,6 +39,7 @@
 #include "catalog/pg_ts_template.h"
 #include "commands/defrem.h"
 #include "tsearch/ts_cache.h"
+#include "tsearch/ts_configmap.h"
 #include "utils/builtins.h"
 #include "utils/catcache.h"
 #include "utils/fmgroids.h"
@@ -51,13 +52,12 @@

 /*
- * MAXTOKENTYPE/MAXDICTSPERTT are arbitrary limits on the workspace size
+ * MAXTOKENTYPE is an arbitrary limit on the workspace size
  * used in lookup_ts_config_cache().  We could avoid hardwiring a limit
  * by making the workspace dynamically enlargeable, but it seems unlikely
  * to be worth the trouble.
  */
-#define MAXTOKENTYPE	256
-#define MAXDICTSPERTT	100
+#define MAXTOKENTYPE	256

 static HTAB *TSParserCacheHash = NULL;
@@ -418,11 +418,10 @@ lookup_ts_config_cache(Oid cfgId)
 	ScanKeyData mapskey;
 	SysScanDesc mapscan;
 	HeapTuple	maptup;
-	ListDictionary maplists[MAXTOKENTYPE + 1];
-	Oid			mapdicts[MAXDICTSPERTT];
+	TSMapElement *mapconfigs[MAXTOKENTYPE + 1];
 	int			maxtokentype;
-	int			ndicts;
 	int			i;
+	TSMapElement *tmpConfig;

 	tp = SearchSysCache1(TSCONFIGOID, ObjectIdGetDatum(cfgId));
 	if (!HeapTupleIsValid(tp))
@@ -453,8 +452,8 @@ lookup_ts_config_cache(Oid cfgId)
 		if (entry->map)
 		{
 			for (i = 0; i < entry->lenmap; i++)
-				if (entry->map[i].dictIds)
-					pfree(entry->map[i].dictIds);
+				if (entry->map[i])
+					TSMapElementFree(entry->map[i]);
 			pfree(entry->map);
 		}
 	}
@@ -468,13 +467,11 @@ lookup_ts_config_cache(Oid cfgId)
 	/*
 	 * Scan pg_ts_config_map to gather dictionary list for each token type
 	 *
-	 * Because the index is on (mapcfg, maptokentype, mapseqno), we will
-	 * see the entries in maptokentype order, and in mapseqno order for
-	 * each token type, even though we didn't explicitly ask for that.
+	 * Because the index is on (mapcfg, maptokentype), we will see the
+	 * entries in maptokentype order even though we didn't explicitly ask
+	 * for that.
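+	 *
+	 * Each row stores the complete dictionary map for its token type as a
+	 * JSONB value; it is decoded with JsonbToTSMap and then copied into
+	 * CacheMemoryContext so that it outlives the scan.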
 	 */
-	MemSet(maplists, 0, sizeof(maplists));
 	maxtokentype = 0;
-	ndicts = 0;

 	ScanKeyInit(&mapskey,
 				Anum_pg_ts_config_map_mapcfg,
@@ -486,6 +483,7 @@ lookup_ts_config_cache(Oid cfgId)
 	mapscan = systable_beginscan_ordered(maprel, mapidx,
 										 NULL, 1, &mapskey);

+	memset(mapconfigs, 0, sizeof(mapconfigs));
 	while ((maptup = systable_getnext_ordered(mapscan, ForwardScanDirection)) != NULL)
 	{
 		Form_pg_ts_config_map cfgmap = (Form_pg_ts_config_map) GETSTRUCT(maptup);
@@ -495,51 +493,27 @@ lookup_ts_config_cache(Oid cfgId)
 			elog(ERROR, "maptokentype value %d is out of range", toktype);
 		if (toktype < maxtokentype)
 			elog(ERROR, "maptokentype entries are out of order");
-		if (toktype > maxtokentype)
-		{
-			/* starting a new token type, but first save the prior data */
-			if (ndicts > 0)
-			{
-				maplists[maxtokentype].len = ndicts;
-				maplists[maxtokentype].dictIds = (Oid *)
-					MemoryContextAlloc(CacheMemoryContext,
-									   sizeof(Oid) * ndicts);
-				memcpy(maplists[maxtokentype].dictIds, mapdicts,
-					   sizeof(Oid) * ndicts);
-			}
-			maxtokentype = toktype;
-			mapdicts[0] = cfgmap->mapdict;
-			ndicts = 1;
-		}
-		else
-		{
-			/* continuing data for current token type */
-			if (ndicts >= MAXDICTSPERTT)
-				elog(ERROR, "too many pg_ts_config_map entries for one token type");
-			mapdicts[ndicts++] = cfgmap->mapdict;
-		}
+
+		maxtokentype = toktype;
+		tmpConfig = JsonbToTSMap(DatumGetJsonbP(PointerGetDatum(&cfgmap->mapdicts)));
+		mapconfigs[maxtokentype] = TSMapMoveToMemoryContext(tmpConfig, CacheMemoryContext);
+		TSMapElementFree(tmpConfig);
+		tmpConfig = NULL;
 	}

 	systable_endscan_ordered(mapscan);
 	index_close(mapidx, AccessShareLock);
 	heap_close(maprel, AccessShareLock);

-	if (ndicts > 0)
+	if (maxtokentype > 0)
 	{
-		/* save the last token type's dictionaries */
-		maplists[maxtokentype].len = ndicts;
-		maplists[maxtokentype].dictIds = (Oid *)
-			MemoryContextAlloc(CacheMemoryContext,
-							   sizeof(Oid) * ndicts);
-		memcpy(maplists[maxtokentype].dictIds, mapdicts,
-			   sizeof(Oid) * ndicts);
-		/* and save the overall map */
+		/* save the overall map */
 		entry->lenmap = maxtokentype + 1;
-		entry->map = (ListDictionary *)
+		entry->map = (TSMapElement **)
 			MemoryContextAlloc(CacheMemoryContext,
-							   sizeof(ListDictionary) * entry->lenmap);
-		memcpy(entry->map, maplists,
-			   sizeof(ListDictionary) * entry->lenmap);
+							   sizeof(TSMapElement *) * entry->lenmap);
+		memcpy(entry->map, mapconfigs,
+			   sizeof(TSMapElement *) * entry->lenmap);
 	}

 	entry->isvalid = true;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index d066f4f00b..c5cb3c62f7 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -14223,15 +14223,29 @@ dumpTSConfig(Archive *fout, TSConfigInfo *cfginfo)
 	PQclear(res);

 	resetPQExpBuffer(query);
-	appendPQExpBuffer(query,
-					  "SELECT\n"
-					  "  ( SELECT alias FROM pg_catalog.ts_token_type('%u'::pg_catalog.oid) AS t\n"
-					  "    WHERE t.tokid = m.maptokentype ) AS tokenname,\n"
-					  "  m.mapdict::pg_catalog.regdictionary AS dictname\n"
-					  "FROM pg_catalog.pg_ts_config_map AS m\n"
-					  "WHERE m.mapcfg = '%u'\n"
-					  "ORDER BY m.mapcfg, m.maptokentype, m.mapseqno",
-					  cfginfo->cfgparser, cfginfo->dobj.catId.oid);
+
+	if (fout->remoteVersion >= 110000)
+		appendPQExpBuffer(query,
+						  "SELECT\n"
+						  "  ( SELECT alias FROM pg_catalog.ts_token_type('%u'::pg_catalog.oid) AS t\n"
+						  "    WHERE t.tokid = m.maptokentype ) AS tokenname,\n"
+						  "  pg_catalog.dictionary_mapping_to_text(m.mapcfg, m.maptokentype) AS dictname\n"
+						  "FROM pg_catalog.pg_ts_config_map AS m\n"
+						  "WHERE m.mapcfg = '%u'\n"
+						  "GROUP BY m.mapcfg, m.maptokentype\n"
+						  "ORDER BY m.mapcfg, m.maptokentype",
+						  cfginfo->cfgparser, 
cfginfo->dobj.catId.oid);
+	else
+		appendPQExpBuffer(query,
+						  "SELECT\n"
+						  "  ( SELECT alias FROM pg_catalog.ts_token_type('%u'::pg_catalog.oid) AS t\n"
+						  "    WHERE t.tokid = m.maptokentype ) AS tokenname,\n"
+						  "  m.mapdict::pg_catalog.regdictionary AS dictname\n"
+						  "FROM pg_catalog.pg_ts_config_map AS m\n"
+						  "WHERE m.mapcfg = '%u'\n"
+						  "ORDER BY m.mapcfg, m.maptokentype, m.mapseqno",
+						  cfginfo->cfgparser, cfginfo->dobj.catId.oid);

 	res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK);
 	ntups = PQntuples(res);
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c
index 0c3be1f504..729242e8e0 100644
--- a/src/bin/psql/describe.c
+++ b/src/bin/psql/describe.c
@@ -4646,25 +4646,41 @@ describeOneTSConfig(const char *oid, const char *nspname, const char *cfgname,

 	initPQExpBuffer(&buf);

-	printfPQExpBuffer(&buf,
-					  "SELECT\n"
-					  "  ( SELECT t.alias FROM\n"
-					  "    pg_catalog.ts_token_type(c.cfgparser) AS t\n"
-					  "    WHERE t.tokid = m.maptokentype ) AS \"%s\",\n"
-					  "  pg_catalog.btrim(\n"
-					  "    ARRAY( SELECT mm.mapdict::pg_catalog.regdictionary\n"
-					  "           FROM pg_catalog.pg_ts_config_map AS mm\n"
-					  "           WHERE mm.mapcfg = m.mapcfg AND mm.maptokentype = m.maptokentype\n"
-					  "           ORDER BY mapcfg, maptokentype, mapseqno\n"
-					  "    ) :: pg_catalog.text,\n"
-					  "  '{}') AS \"%s\"\n"
-					  "FROM pg_catalog.pg_ts_config AS c, pg_catalog.pg_ts_config_map AS m\n"
-					  "WHERE c.oid = '%s' AND m.mapcfg = c.oid\n"
-					  "GROUP BY m.mapcfg, m.maptokentype, c.cfgparser\n"
-					  "ORDER BY 1;",
-					  gettext_noop("Token"),
-					  gettext_noop("Dictionaries"),
-					  oid);
+	if (pset.sversion >= 110000)
+		printfPQExpBuffer(&buf,
+						  "SELECT\n"
+						  "  ( SELECT t.alias FROM\n"
+						  "    pg_catalog.ts_token_type(c.cfgparser) AS t\n"
+						  "    WHERE t.tokid = m.maptokentype ) AS \"%s\",\n"
+						  "  pg_catalog.dictionary_mapping_to_text(m.mapcfg, m.maptokentype) AS \"%s\"\n"
+						  "FROM pg_catalog.pg_ts_config AS c, pg_catalog.pg_ts_config_map AS m\n"
+						  "WHERE c.oid = '%s' AND m.mapcfg = c.oid\n"
+						  "GROUP BY m.mapcfg, m.maptokentype, c.cfgparser\n"
+						  "ORDER BY 1;",
+						  gettext_noop("Token"),
+						  gettext_noop("Dictionaries"),
+						  oid);
+	else
+		printfPQExpBuffer(&buf,
+						  "SELECT\n"
+						  "  ( SELECT t.alias FROM\n"
+						  "    pg_catalog.ts_token_type(c.cfgparser) AS t\n"
+						  "    WHERE t.tokid = m.maptokentype ) AS \"%s\",\n"
+						  "  pg_catalog.btrim(\n"
+						  "    ARRAY( SELECT mm.mapdict::pg_catalog.regdictionary\n"
+						  "           FROM pg_catalog.pg_ts_config_map AS mm\n"
+						  "           WHERE mm.mapcfg = m.mapcfg AND mm.maptokentype = m.maptokentype\n"
+						  "           ORDER BY mapcfg, maptokentype, mapseqno\n"
+						  "    ) :: pg_catalog.text,\n"
+						  "  '{}') AS \"%s\"\n"
+						  "FROM pg_catalog.pg_ts_config AS c, pg_catalog.pg_ts_config_map AS m\n"
+						  "WHERE c.oid = '%s' AND m.mapcfg = c.oid\n"
+						  "GROUP BY m.mapcfg, m.maptokentype, c.cfgparser\n"
+						  "ORDER BY 1;",
+						  gettext_noop("Token"),
+						  gettext_noop("Dictionaries"),
+						  oid);
+
 	res = PSQLexec(buf.data);
 	termPQExpBuffer(&buf);
diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h
index 7dd9d108d6..589bce476b 100644
--- a/src/include/catalog/indexing.h
+++ b/src/include/catalog/indexing.h
@@ -262,7 +262,7 @@ DECLARE_UNIQUE_INDEX(pg_ts_config_cfgname_index, 3608, on pg_ts_config using btr
 DECLARE_UNIQUE_INDEX(pg_ts_config_oid_index, 3712, on pg_ts_config using btree(oid oid_ops));
 #define TSConfigOidIndexId	3712

-DECLARE_UNIQUE_INDEX(pg_ts_config_map_index, 3609, on pg_ts_config_map using btree(mapcfg oid_ops, maptokentype int4_ops, mapseqno int4_ops));
+DECLARE_UNIQUE_INDEX(pg_ts_config_map_index, 3609, on pg_ts_config_map using btree(mapcfg oid_ops, maptokentype int4_ops));
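+/* mapseqno is gone: the whole map for a token type is stored as one JSONB value */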
 #define TSConfigMapIndexId	3609

 DECLARE_UNIQUE_INDEX(pg_ts_dict_dictname_index, 3604, on pg_ts_dict using btree(dictname name_ops, dictnamespace oid_ops));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 9bf20c059b..bd9549ac39 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4988,6 +4988,12 @@ DESCR("transform jsonb to tsvector");
 DATA(insert OID = 4212 (  to_tsvector	PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3614 "3734 114" _null_ _null_ _null_ _null_ _null_ json_to_tsvector_byid _null_ _null_ _null_ ));
 DESCR("transform json to tsvector");

+DATA(insert OID = 8891 (  dictionary_mapping_to_text	PGNSP PGUID 12 100 0 0 0 f f f t f s s 2 0 25 "26 23" _null_ _null_ _null_ _null_ _null_ dictionary_mapping_to_text _null_ _null_ _null_ ));
+DESCR("returns text representation of dictionary configuration map");
+
+DATA(insert OID = 8892 (  ts_debug	PGNSP PGUID 12 100 1 0 0 f f f t t s s 2 0 2249 "3734 25" "{3734,25,25,25,25,3770,25,25,1009}" "{i,i,o,o,o,o,o,o,o}" "{cfgId,inputText,alias,description,token,dictionaries,configuration,command,lexemes}" _null_ _null_ ts_debug _null_ _null_ _null_));
+DESCR("debug function for text search configuration");
+
 DATA(insert OID = 3752 (  tsvector_update_trigger	PGNSP PGUID 12 1 0 0 0 f f f f f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ tsvector_update_trigger_byid _null_ _null_ _null_ ));
 DESCR("trigger for automatic update of tsvector column");
 DATA(insert OID = 3753 (  tsvector_update_trigger_column	PGNSP PGUID 12 1 0 0 0 f f f f f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ tsvector_update_trigger_bycolumn _null_ _null_ _null_ ));
diff --git a/src/include/catalog/pg_ts_config_map.h b/src/include/catalog/pg_ts_config_map.h
index a3d9e3f21f..65a9a73369 100644
--- a/src/include/catalog/pg_ts_config_map.h
+++ b/src/include/catalog/pg_ts_config_map.h
@@ -22,6 +22,7 @@
 #define PG_TS_CONFIG_MAP_H

 #include "catalog/genbki.h"
+#include "utils/jsonb.h"

 /* ----------------
  *		pg_ts_config_map definition.  cpp turns this into
@@ -30,49 +31,114 @@
  */
#define TSConfigMapRelationId	3603

+/*
+ * Create a typedef so that the same type name can be used in the generated
+ * DB initialization script and in C source code
+ */
+typedef Jsonb jsonb;
+
 CATALOG(pg_ts_config_map,3603) BKI_WITHOUT_OIDS
 {
 	Oid			mapcfg;			/* OID of configuration owning this entry */
 	int32		maptokentype;	/* token type from parser */
-	int32		mapseqno;		/* order in which to consult dictionaries */
-	Oid			mapdict;		/* dictionary to consult */
+
+	/*
+	 * mapdicts is the only variable-length field, so it is safe to access
+	 * it directly instead of hiding it from the C interface.
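+	 * A single JSONB value holds the complete mapping expression for one
+	 * token type, so there is exactly one row per (mapcfg, maptokentype).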
+	 */
+	jsonb		mapdicts;		/* dictionary map Jsonb representation */
 } FormData_pg_ts_config_map;

 typedef FormData_pg_ts_config_map *Form_pg_ts_config_map;

+/*
+ * Element of the mapping expression tree
+ */
+typedef struct TSMapElement
+{
+	int			type;			/* Type of the element */
+	union
+	{
+		struct TSMapExpression *objectExpression;
+		struct TSMapCase *objectCase;
+		Oid			objectDictionary;
+		void	   *object;
+	}			value;
+	struct TSMapElement *parent;	/* Parent in the expression tree */
+} TSMapElement;
+
+/*
+ * Representation of an expression with an operator and two operands
+ */
+typedef struct TSMapExpression
+{
+	int			operator;
+	TSMapElement *left;
+	TSMapElement *right;
+} TSMapExpression;
+
+/*
+ * Representation of a CASE structure inside the database
+ */
+typedef struct TSMapCase
+{
+	TSMapElement *condition;
+	TSMapElement *command;
+	TSMapElement *elsebranch;
+	bool		match;			/* If false, NO MATCH is used */
+} TSMapCase;
+
 /* ----------------
- *		compiler constants for pg_ts_config_map
+ *		Compiler constants for pg_ts_config_map
  * ----------------
  */
-#define Natts_pg_ts_config_map				4
+#define Natts_pg_ts_config_map				3
 #define Anum_pg_ts_config_map_mapcfg		1
 #define Anum_pg_ts_config_map_maptokentype	2
-#define Anum_pg_ts_config_map_mapseqno		3
-#define Anum_pg_ts_config_map_mapdict		4
+#define Anum_pg_ts_config_map_mapdicts		3
+
+/* ----------------
+ *		Dictionary map operators
+ * ----------------
+ */
+#define TSMAP_OP_MAP		1
+#define TSMAP_OP_UNION		2
+#define TSMAP_OP_EXCEPT		3
+#define TSMAP_OP_INTERSECT	4
+#define TSMAP_OP_COMMA		5
+
+/* ----------------
+ *		TSMapElement object types
+ * ----------------
+ */
+#define TSMAP_EXPRESSION	1
+#define TSMAP_CASE			2
+#define TSMAP_DICTIONARY	3
+#define TSMAP_KEEP			4

 /* ----------------
  *		initial contents of pg_ts_config_map
  * ----------------
  */
-DATA(insert ( 3748	1	1	3765 ));
-DATA(insert ( 3748	2	1	3765 ));
-DATA(insert ( 3748	3	1	3765 ));
-DATA(insert ( 3748	4	1	3765 ));
-DATA(insert ( 3748	5	1	3765 ));
-DATA(insert ( 3748	6	1	3765 ));
-DATA(insert ( 3748	7	1	3765 ));
-DATA(insert ( 3748	8	1	3765 ));
-DATA(insert ( 3748	9	1	3765 ));
-DATA(insert ( 3748	10	1	3765 ));
-DATA(insert ( 3748	11	1	3765 ));
-DATA(insert ( 3748	15	1	3765 ));
-DATA(insert ( 3748	16	1	3765 ));
-DATA(insert ( 3748	17	1	3765 ));
-DATA(insert ( 3748	18	1	3765 ));
-DATA(insert ( 3748	19	1	3765 ));
-DATA(insert ( 3748	20	1	3765 ));
-DATA(insert ( 3748	21	1	3765 ));
-DATA(insert ( 3748	22	1	3765 ));
+DATA(insert ( 3748	1	"[3765]" ));
+DATA(insert ( 3748	2	"[3765]" ));
+DATA(insert ( 3748	3	"[3765]" ));
+DATA(insert ( 3748	4	"[3765]" ));
+DATA(insert ( 3748	5	"[3765]" ));
+DATA(insert ( 3748	6	"[3765]" ));
+DATA(insert ( 3748	7	"[3765]" ));
+DATA(insert ( 3748	8	"[3765]" ));
+DATA(insert ( 3748	9	"[3765]" ));
+DATA(insert ( 3748	10	"[3765]" ));
+DATA(insert ( 3748	11	"[3765]" ));
+DATA(insert ( 3748	15	"[3765]" ));
+DATA(insert ( 3748	16	"[3765]" ));
+DATA(insert ( 3748	17	"[3765]" ));
+DATA(insert ( 3748	18	"[3765]" ));
+DATA(insert ( 3748	19	"[3765]" ));
+DATA(insert ( 3748	20	"[3765]" ));
+DATA(insert ( 3748	21	"[3765]" ));
+DATA(insert ( 3748	22	"[3765]" ));

 #endif							/* PG_TS_CONFIG_MAP_H */
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index fce48026b6..1d3896d494 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -385,6 +385,9 @@ typedef enum NodeTag
 	T_CreateEnumStmt,
 	T_CreateRangeStmt,
 	T_AlterEnumStmt,
+	T_DictMapExprElem,
+	T_DictMapElem,
+	T_DictMapCase,
 	T_AlterTSDictionaryStmt,
 	T_AlterTSConfigurationStmt,
 	T_CreateFdwStmt,
diff --git 
a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 699fa77bc7..6103b12cce 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -3434,6 +3434,50 @@ typedef enum AlterTSConfigType
 	ALTER_TSCONFIG_DROP_MAPPING
 } AlterTSConfigType;

+/*
+ * TS Configuration expression tree element types
+ */
+typedef enum DictMapElemType
+{
+	DICT_MAP_CASE,
+	DICT_MAP_EXPRESSION,
+	DICT_MAP_KEEP,
+	DICT_MAP_DICTIONARY
+} DictMapElemType;
+
+/*
+ * TS Configuration expression tree abstract element
+ */
+typedef struct DictMapElem
+{
+	NodeTag		type;
+	int8		kind;			/* see DictMapElemType */
+	void	   *data;			/* actual type is determined by kind */
+} DictMapElem;
+
+/*
+ * TS Configuration expression tree element with operator and operands
+ */
+typedef struct DictMapExprElem
+{
+	NodeTag		type;
+	DictMapElem *left;
+	DictMapElem *right;
+	int8		oper;
+} DictMapExprElem;
+
+/*
+ * TS Configuration expression tree CASE element
+ */
+typedef struct DictMapCase
+{
+	NodeTag		type;
+	struct DictMapElem *condition;
+	struct DictMapElem *command;
+	struct DictMapElem *elsebranch;
+	bool		match;
+} DictMapCase;
+
 typedef struct AlterTSConfigurationStmt
 {
 	NodeTag		type;
@@ -3446,6 +3490,7 @@ typedef struct AlterTSConfigurationStmt
 	 */
 	List	   *tokentype;		/* list of Value strings */
 	List	   *dicts;			/* list of list of Value strings */
+	DictMapElem *dict_map;		/* tree of the mapping expression */
 	bool		override;		/* if true - remove old variant */
 	bool		replace;		/* if true - replace dictionary by another */
 	bool		missing_ok;		/* for DROP - skip error if missing? */
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 4dff55a8e9..3371f286a8 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -220,6 +220,7 @@ PG_KEYWORD("is", IS, TYPE_FUNC_NAME_KEYWORD)
 PG_KEYWORD("isnull", ISNULL, TYPE_FUNC_NAME_KEYWORD)
 PG_KEYWORD("isolation", ISOLATION, UNRESERVED_KEYWORD)
 PG_KEYWORD("join", JOIN, TYPE_FUNC_NAME_KEYWORD)
+PG_KEYWORD("keep", KEEP, RESERVED_KEYWORD)
 PG_KEYWORD("key", KEY, UNRESERVED_KEYWORD)
 PG_KEYWORD("label", LABEL, UNRESERVED_KEYWORD)
 PG_KEYWORD("language", LANGUAGE, UNRESERVED_KEYWORD)
@@ -242,6 +243,7 @@ PG_KEYWORD("location", LOCATION, UNRESERVED_KEYWORD)
 PG_KEYWORD("lock", LOCK_P, UNRESERVED_KEYWORD)
 PG_KEYWORD("locked", LOCKED, UNRESERVED_KEYWORD)
 PG_KEYWORD("logged", LOGGED, UNRESERVED_KEYWORD)
+PG_KEYWORD("map", MAP, UNRESERVED_KEYWORD)
 PG_KEYWORD("mapping", MAPPING, UNRESERVED_KEYWORD)
 PG_KEYWORD("match", MATCH, UNRESERVED_KEYWORD)
 PG_KEYWORD("matched", MATCHED, UNRESERVED_KEYWORD)
diff --git a/src/include/tsearch/ts_cache.h b/src/include/tsearch/ts_cache.h
index 410f1d54af..4633dd7618 100644
--- a/src/include/tsearch/ts_cache.h
+++ b/src/include/tsearch/ts_cache.h
@@ -14,6 +14,7 @@
 #define TS_CACHE_H

+#include "catalog/pg_ts_config_map.h"
 #include "utils/guc.h"


@@ -66,6 +67,7 @@ typedef struct
 {
 	int			len;
 	Oid		   *dictIds;
+	int32	   *dictOptions;
 } ListDictionary;

 typedef struct
@@ -77,7 +79,7 @@ typedef struct
 	Oid			prsId;

 	int			lenmap;
-	ListDictionary *map;
+	TSMapElement **map;
 } TSConfigCacheEntry;
diff --git a/src/include/tsearch/ts_configmap.h b/src/include/tsearch/ts_configmap.h
new file mode 100644
index 0000000000..79e618052e
--- /dev/null
+++ b/src/include/tsearch/ts_configmap.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_configmap.h
+ *	  internal representation of text search configuration and utilities for it
+ *
+ * Copyright (c) 1998-2018, PostgreSQL Global
Development Group
+ *
+ * src/include/tsearch/ts_configmap.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _PG_TS_CONFIGMAP_H_
+#define _PG_TS_CONFIGMAP_H_
+
+#include "utils/jsonb.h"
+#include "catalog/pg_ts_config_map.h"
+
+/*
+ * Configuration storage functions
+ * Provide an interface to convert a ts configuration into JSONB and vice versa
+ */
+
+/* Convert a TSMapElement structure into JSONB */
+extern Jsonb *TSMapToJsonb(TSMapElement *config);
+
+/* Extract a TSMapElement from JSONB-formatted data */
+extern TSMapElement *JsonbToTSMap(Jsonb *json);
+/* Replace all occurrences of oldDict by newDict */
+extern void TSMapReplaceDictionary(TSMapElement *config, Oid oldDict, Oid newDict);
+
+/* Move an expression tree into the specified memory context */
+extern TSMapElement *TSMapMoveToMemoryContext(TSMapElement *config, MemoryContext context);
+/* Free all nodes of an expression tree */
+extern void TSMapElementFree(TSMapElement *element);
+
+/* Print a map in human-readable format */
+extern void TSMapPrintElement(TSMapElement *config, StringInfo result);
+
+/* Print the dictionary name for a given Oid */
+extern void TSMapPrintDictName(Oid dictId, StringInfo result);
+
+/* Return all dictionaries used in config */
+extern Oid *TSMapGetDictionaries(TSMapElement *config);
+
+/* Do a deep comparison of two TSMapElements.  Doesn't check parents of elements */
+extern bool TSMapElementEquals(TSMapElement *a, TSMapElement *b);
+
+#endif							/* _PG_TS_CONFIGMAP_H_ */
diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h
index 0b7a5aa68e..d970eec0ab 100644
--- a/src/include/tsearch/ts_public.h
+++ b/src/include/tsearch/ts_public.h
@@ -115,6 +115,7 @@ typedef struct
 #define TSL_ADDPOS		0x01
 #define TSL_PREFIX		0x02
 #define TSL_FILTER		0x04
+#define TSL_MULTI		0x08

 /*
  * Struct for supporting complex dictionaries like thesaurus.
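For illustration, here is a minimal sketch of how the structures declared in
pg_ts_config_map.h could represent the mapping

    CASE english_hunspell WHEN MATCH THEN KEEP ELSE english_stem END

It is not part of the patch: the helper names and the way the parent links
are wired are assumptions based only on the declarations above, and the
dictionary OIDs are hypothetical placeholders supplied by the caller.

#include "postgres.h"
#include "catalog/pg_ts_config_map.h"

/* Wrap a dictionary OID in a TSMAP_DICTIONARY leaf node. */
static TSMapElement *
make_dictionary_node(Oid dictOid)
{
    TSMapElement *e = (TSMapElement *) palloc0(sizeof(TSMapElement));

    e->type = TSMAP_DICTIONARY;
    e->value.objectDictionary = dictOid;
    return e;
}

/* Build CASE <conditionDict> WHEN MATCH THEN KEEP ELSE <elseDict> END. */
static TSMapElement *
make_case_keep_else(Oid conditionDict, Oid elseDict)
{
    TSMapCase  *c = (TSMapCase *) palloc0(sizeof(TSMapCase));
    TSMapElement *keep = (TSMapElement *) palloc0(sizeof(TSMapElement));
    TSMapElement *e = (TSMapElement *) palloc0(sizeof(TSMapElement));

    keep->type = TSMAP_KEEP;    /* KEEP re-emits the condition's output */

    c->match = true;            /* WHEN MATCH, as opposed to WHEN NO MATCH */
    c->condition = make_dictionary_node(conditionDict);
    c->command = keep;
    c->elsebranch = make_dictionary_node(elseDict);

    e->type = TSMAP_CASE;
    e->value.objectCase = c;
    c->condition->parent = e;
    c->command->parent = e;
    c->elsebranch->parent = e;
    return e;
}

A UNION of two such branches, as in the ALTER TEXT SEARCH CONFIGURATION
example in alter_tsconfig.sgml, would be one more TSMapElement of type
TSMAP_EXPRESSION whose TSMapExpression carries operator = TSMAP_OP_UNION
and the two CASE nodes as its left and right operands.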
diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out index d56c70c847..08c2674d46 100644 --- a/src/test/regress/expected/oidjoins.out +++ b/src/test/regress/expected/oidjoins.out @@ -1089,14 +1089,6 @@ WHERE mapcfg != 0 AND ------+-------- (0 rows) -SELECT ctid, mapdict -FROM pg_catalog.pg_ts_config_map fk -WHERE mapdict != 0 AND - NOT EXISTS(SELECT 1 FROM pg_catalog.pg_ts_dict pk WHERE pk.oid = fk.mapdict); - ctid | mapdict -------+--------- -(0 rows) - SELECT ctid, dictnamespace FROM pg_catalog.pg_ts_dict fk WHERE dictnamespace != 0 AND diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out index 0c1d7c7675..512af5975e 100644 --- a/src/test/regress/expected/tsdicts.out +++ b/src/test/regress/expected/tsdicts.out @@ -420,6 +420,105 @@ SELECT ts_lexize('thesaurus', 'one'); {1} (1 row) +-- test dictionary pipeline in configuration +CREATE TEXT SEARCH CONFIGURATION english_union( + COPY=english +); +ALTER TEXT SEARCH CONFIGURATION english_union ALTER MAPPING FOR + asciiword + WITH english_stem UNION simple; +SELECT to_tsvector('english_union', 'book'); + to_tsvector +------------- + 'book':1 +(1 row) + +SELECT to_tsvector('english_union', 'books'); + to_tsvector +-------------------- + 'book':1 'books':1 +(1 row) + +SELECT to_tsvector('english_union', 'booking'); + to_tsvector +---------------------- + 'book':1 'booking':1 +(1 row) + +CREATE TEXT SEARCH CONFIGURATION english_intersect( + COPY=english +); +ALTER TEXT SEARCH CONFIGURATION english_intersect ALTER MAPPING FOR + asciiword + WITH english_stem INTERSECT simple; +SELECT to_tsvector('english_intersect', 'book'); + to_tsvector +------------- + 'book':1 +(1 row) + +SELECT to_tsvector('english_intersect', 'books'); + to_tsvector +------------- + +(1 row) + +SELECT to_tsvector('english_intersect', 'booking'); + to_tsvector +------------- + +(1 row) + +CREATE TEXT SEARCH CONFIGURATION english_except( + COPY=english +); +ALTER TEXT SEARCH CONFIGURATION english_except ALTER MAPPING FOR + asciiword + WITH simple EXCEPT english_stem; +SELECT to_tsvector('english_except', 'book'); + to_tsvector +------------- + +(1 row) + +SELECT to_tsvector('english_except', 'books'); + to_tsvector +------------- + 'books':1 +(1 row) + +SELECT to_tsvector('english_except', 'booking'); + to_tsvector +------------- + 'booking':1 +(1 row) + +CREATE TEXT SEARCH CONFIGURATION english_branches( + COPY=english +); +ALTER TEXT SEARCH CONFIGURATION english_branches ALTER MAPPING FOR + asciiword + WITH CASE ispell WHEN MATCH THEN KEEP + ELSE english_stem + END; +SELECT to_tsvector('english_branches', 'book'); + to_tsvector +------------- + 'book':1 +(1 row) + +SELECT to_tsvector('english_branches', 'books'); + to_tsvector +------------- + 'book':1 +(1 row) + +SELECT to_tsvector('english_branches', 'booking'); + to_tsvector +---------------------- + 'book':1 'booking':1 +(1 row) + -- Test ispell dictionary in configuration CREATE TEXT SEARCH CONFIGURATION ispell_tst ( COPY=english @@ -580,6 +679,163 @@ SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a 'card':3,10 'invit':2,9 'like':6 'look':5 'order':1,8 (1 row) +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH CASE + thesaurus WHEN MATCH THEN KEEP ELSE english_stem +END; +SELECT to_tsvector('thesaurus_tst', 'The Mysterious Rings of Supernova 1987A'); + to_tsvector +--------------------------------------- + '1987a':6 'mysteri':2 'ring':3 'sn':5 +(1 row) + +ALTER TEXT SEARCH 
CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH thesaurus UNION english_stem; +SELECT to_tsvector('thesaurus_tst', 'The Mysterious Rings of Supernova 1987A'); + to_tsvector +----------------------------------------------------- + '1987a':6 'mysteri':2 'ring':3 'sn':5 'supernova':5 +(1 row) + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH simple UNION thesaurus; +SELECT to_tsvector('thesaurus_tst', 'The Mysterious Rings of Supernova 1987A'); + to_tsvector +------------------------------------------------------------------------ + '1987a':6 'mysterious':2 'of':4 'rings':3 'sn':5 'supernova':5 'the':1 +(1 row) + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH CASE + thesaurus WHEN MATCH THEN simple UNION thesaurus + ELSE simple +END; +\dF+ thesaurus_tst + Text search configuration "public.thesaurus_tst" +Parser: "pg_catalog.default" + Token | Dictionaries +-----------------+------------------------------------------------------- + asciihword | synonym, thesaurus, english_stem + asciiword | CASE thesaurus WHEN MATCH THEN simple UNION thesaurus+ + | ELSE simple + + | END + email | simple + file | simple + float | simple + host | simple + hword | english_stem + hword_asciipart | synonym, thesaurus, english_stem + hword_numpart | simple + hword_part | english_stem + int | simple + numhword | simple + numword | simple + sfloat | simple + uint | simple + url | simple + url_path | simple + version | simple + word | english_stem + +SELECT to_tsvector('thesaurus_tst', 'one two'); + to_tsvector +------------------------ + '12':1 'one':1 'two':2 +(1 row) + +SELECT to_tsvector('thesaurus_tst', 'one two three'); + to_tsvector +----------------------------------- + '123':1 'one':1 'three':3 'two':2 +(1 row) + +SELECT to_tsvector('thesaurus_tst', 'one two four'); + to_tsvector +--------------------------------- + '12':1 'four':3 'one':1 'two':2 +(1 row) + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH CASE + thesaurus WHEN NO MATCH THEN simple ELSE thesaurus +END; +\dF+ thesaurus_tst + Text search configuration "public.thesaurus_tst" +Parser: "pg_catalog.default" + Token | Dictionaries +-----------------+------------------------------------------ + asciihword | synonym, thesaurus, english_stem + asciiword | CASE thesaurus WHEN NO MATCH THEN simple+ + | ELSE thesaurus + + | END + email | simple + file | simple + float | simple + host | simple + hword | english_stem + hword_asciipart | synonym, thesaurus, english_stem + hword_numpart | simple + hword_part | english_stem + int | simple + numhword | simple + numword | simple + sfloat | simple + uint | simple + url | simple + url_path | simple + version | simple + word | english_stem + +SELECT to_tsvector('thesaurus_tst', 'one two'); + to_tsvector +------------- + '12':1 +(1 row) + +SELECT to_tsvector('thesaurus_tst', 'one two three'); + to_tsvector +------------- + '123':1 +(1 row) + +SELECT to_tsvector('thesaurus_tst', 'one two books'); + to_tsvector +------------------ + '12':1 'books':2 +(1 row) + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING + REPLACE simple WITH english_stem; +SELECT to_tsvector('thesaurus_tst', 'one two'); + to_tsvector +------------- + '12':1 +(1 row) + +SELECT to_tsvector('thesaurus_tst', 'one two three'); + to_tsvector +------------- + '123':1 +(1 row) + +SELECT to_tsvector('thesaurus_tst', 'one two books'); + to_tsvector +----------------- + '12':1 'book':2 +(1 row) + +CREATE TEXT SEARCH CONFIGURATION 
operators_tst ( + COPY=thesaurus_tst +); +ALTER TEXT SEARCH CONFIGURATION operators_tst ALTER MAPPING FOR asciiword WITH english_stem UNION simple; +SELECT to_tsvector('operators_tst', 'The Mysterious Rings of Supernova 1987A'); + to_tsvector +-------------------------------------------------------------------------------------- + '1987a':6 'mysteri':2 'mysterious':2 'of':4 'ring':3 'rings':3 'supernova':5 'the':1 +(1 row) + +ALTER TEXT SEARCH CONFIGURATION operators_tst ALTER MAPPING FOR asciiword WITH english_stem UNION (synonym, simple); +SELECT to_tsvector('operators_tst', 'The Mysterious Rings of Supernova 1987A Postgres'); + to_tsvector +----------------------------------------------------------------------------------------------------------- + '1987a':6 'mysteri':2 'mysterious':2 'of':4 'pgsql':7 'postgr':7 'ring':3 'rings':3 'supernova':5 'the':1 +(1 row) + -- invalid: non-lowercase quoted identifiers CREATE TEXT SEARCH DICTIONARY tsdict_case ( diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index d63fb12f1d..c0e9fc5c8f 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -36,11 +36,11 @@ WHERE cfgnamespace = 0 OR cfgowner = 0 OR cfgparser = 0; -----+--------- (0 rows) -SELECT mapcfg, maptokentype, mapseqno +SELECT mapcfg, maptokentype FROM pg_ts_config_map -WHERE mapcfg = 0 OR mapdict = 0; - mapcfg | maptokentype | mapseqno ---------+--------------+---------- +WHERE mapcfg = 0; + mapcfg | maptokentype +--------+-------------- (0 rows) -- Look for pg_ts_config_map entries that aren't one of parser's token types @@ -51,8 +51,8 @@ RIGHT JOIN pg_ts_config_map AS m ON (tt.cfgid=m.mapcfg AND tt.tokid=m.maptokentype) WHERE tt.cfgid IS NULL OR tt.tokid IS NULL; - cfgid | tokid | mapcfg | maptokentype | mapseqno | mapdict --------+-------+--------+--------------+----------+--------- + cfgid | tokid | mapcfg | maptokentype | mapdicts +-------+-------+--------+--------------+---------- (0 rows) -- test basic text search behavior without indexes, then with @@ -567,55 +567,55 @@ SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://ae -- ts_debug SELECT * from ts_debug('english', 'abc&nm1;def©ghiõjkl'); - alias | description | token | dictionaries | dictionary | lexemes ------------+-----------------+----------------------------+----------------+--------------+--------- - tag | XML tag | | {} | | - asciiword | Word, all ASCII | abc | {english_stem} | english_stem | {abc} - entity | XML entity | &nm1; | {} | | - asciiword | Word, all ASCII | def | {english_stem} | english_stem | {def} - entity | XML entity | © | {} | | - asciiword | Word, all ASCII | ghi | {english_stem} | english_stem | {ghi} - entity | XML entity | õ | {} | | - asciiword | Word, all ASCII | jkl | {english_stem} | english_stem | {jkl} - tag | XML tag | | {} | | + alias | description | token | dictionaries | configuration | command | lexemes +-----------+-----------------+----------------------------+----------------+---------------+--------------+--------- + tag | XML tag | | {} | | | + asciiword | Word, all ASCII | abc | {english_stem} | english_stem | english_stem | {abc} + entity | XML entity | &nm1; | {} | | | + asciiword | Word, all ASCII | def | {english_stem} | english_stem | english_stem | {def} + entity | XML entity | © | {} | | | + asciiword | Word, all ASCII | ghi | {english_stem} | english_stem | english_stem | {ghi} + entity | XML entity | õ | {} | | | + asciiword | Word, all ASCII | jkl | 
{english_stem} | english_stem | english_stem | {jkl} + tag | XML tag | | {} | | | (9 rows) -- check parsing of URLs SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx'); - alias | description | token | dictionaries | dictionary | lexemes -----------+---------------+----------------------------------------+--------------+------------+------------------------------------------ - protocol | Protocol head | http:// | {} | | - url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx} - host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk} - url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx} - tag | XML tag | | {} | | + alias | description | token | dictionaries | configuration | command | lexemes +----------+---------------+----------------------------------------+--------------+---------------+---------+------------------------------------------ + protocol | Protocol head | http:// | {} | | | + url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | simple | {www.harewoodsolutions.co.uk/press.aspx} + host | Host | www.harewoodsolutions.co.uk | {simple} | simple | simple | {www.harewoodsolutions.co.uk} + url_path | URL path | /press.aspx | {simple} | simple | simple | {/press.aspx} + tag | XML tag | | {} | | | (5 rows) SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw'); - alias | description | token | dictionaries | dictionary | lexemes -----------+---------------+----------------------------+--------------+------------+------------------------------ - protocol | Protocol head | http:// | {} | | - url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw} - host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr} - url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw} - tag | XML tag | | {} | | + alias | description | token | dictionaries | configuration | command | lexemes +----------+---------------+----------------------------+--------------+---------------+---------+------------------------------ + protocol | Protocol head | http:// | {} | | | + url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | simple | {aew.wer0c.ewr/id?ad=qwe&dw} + host | Host | aew.wer0c.ewr | {simple} | simple | simple | {aew.wer0c.ewr} + url_path | URL path | /id?ad=qwe&dw | {simple} | simple | simple | {/id?ad=qwe&dw} + tag | XML tag | | {} | | | (5 rows) SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?'); - alias | description | token | dictionaries | dictionary | lexemes -----------+---------------+----------------------+--------------+------------+------------------------ - protocol | Protocol head | http:// | {} | | - url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?} - host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} - url_path | URL path | /? | {simple} | simple | {/?} + alias | description | token | dictionaries | configuration | command | lexemes +----------+---------------+----------------------+--------------+---------------+---------+------------------------ + protocol | Protocol head | http:// | {} | | | + url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | simple | {5aew.werc.ewr:8100/?} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | simple | {5aew.werc.ewr:8100} + url_path | URL path | /? 
| {simple} | simple | simple | {/?} (4 rows) SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx'); - alias | description | token | dictionaries | dictionary | lexemes -----------+-------------+------------------------+--------------+------------+-------------------------- - url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx} - host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} - url_path | URL path | /?xx | {simple} | simple | {/?xx} + alias | description | token | dictionaries | configuration | command | lexemes +----------+-------------+------------------------+--------------+---------------+---------+-------------------------- + url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | simple | {5aew.werc.ewr:8100/?xx} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | simple | {5aew.werc.ewr:8100} + url_path | URL path | /?xx | {simple} | simple | simple | {/?xx} (3 rows) SELECT token, alias, diff --git a/src/test/regress/sql/oidjoins.sql b/src/test/regress/sql/oidjoins.sql index 656cace451..4e6730fa69 100644 --- a/src/test/regress/sql/oidjoins.sql +++ b/src/test/regress/sql/oidjoins.sql @@ -545,10 +545,6 @@ SELECT ctid, mapcfg FROM pg_catalog.pg_ts_config_map fk WHERE mapcfg != 0 AND NOT EXISTS(SELECT 1 FROM pg_catalog.pg_ts_config pk WHERE pk.oid = fk.mapcfg); -SELECT ctid, mapdict -FROM pg_catalog.pg_ts_config_map fk -WHERE mapdict != 0 AND - NOT EXISTS(SELECT 1 FROM pg_catalog.pg_ts_dict pk WHERE pk.oid = fk.mapdict); SELECT ctid, dictnamespace FROM pg_catalog.pg_ts_dict fk WHERE dictnamespace != 0 AND diff --git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql index 1633c0d066..080ddc486a 100644 --- a/src/test/regress/sql/tsdicts.sql +++ b/src/test/regress/sql/tsdicts.sql @@ -117,6 +117,57 @@ CREATE TEXT SEARCH DICTIONARY thesaurus ( SELECT ts_lexize('thesaurus', 'one'); +-- test dictionary pipeline in configuration +CREATE TEXT SEARCH CONFIGURATION english_union( + COPY=english +); + +ALTER TEXT SEARCH CONFIGURATION english_union ALTER MAPPING FOR + asciiword + WITH english_stem UNION simple; + +SELECT to_tsvector('english_union', 'book'); +SELECT to_tsvector('english_union', 'books'); +SELECT to_tsvector('english_union', 'booking'); + +CREATE TEXT SEARCH CONFIGURATION english_intersect( + COPY=english +); + +ALTER TEXT SEARCH CONFIGURATION english_intersect ALTER MAPPING FOR + asciiword + WITH english_stem INTERSECT simple; + +SELECT to_tsvector('english_intersect', 'book'); +SELECT to_tsvector('english_intersect', 'books'); +SELECT to_tsvector('english_intersect', 'booking'); + +CREATE TEXT SEARCH CONFIGURATION english_except( + COPY=english +); + +ALTER TEXT SEARCH CONFIGURATION english_except ALTER MAPPING FOR + asciiword + WITH simple EXCEPT english_stem; + +SELECT to_tsvector('english_except', 'book'); +SELECT to_tsvector('english_except', 'books'); +SELECT to_tsvector('english_except', 'booking'); + +CREATE TEXT SEARCH CONFIGURATION english_branches( + COPY=english +); + +ALTER TEXT SEARCH CONFIGURATION english_branches ALTER MAPPING FOR + asciiword + WITH CASE ispell WHEN MATCH THEN KEEP + ELSE english_stem + END; + +SELECT to_tsvector('english_branches', 'book'); +SELECT to_tsvector('english_branches', 'books'); +SELECT to_tsvector('english_branches', 'booking'); + -- Test ispell dictionary in configuration CREATE TEXT SEARCH CONFIGURATION ispell_tst ( COPY=english @@ -189,6 +240,50 @@ SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); SELECT 
to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbreviation SN)'); SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets'); +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH CASE + thesaurus WHEN MATCH THEN KEEP ELSE english_stem +END; +SELECT to_tsvector('thesaurus_tst', 'The Mysterious Rings of Supernova 1987A'); + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH thesaurus UNION english_stem; +SELECT to_tsvector('thesaurus_tst', 'The Mysterious Rings of Supernova 1987A'); + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH simple UNION thesaurus; +SELECT to_tsvector('thesaurus_tst', 'The Mysterious Rings of Supernova 1987A'); + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH CASE + thesaurus WHEN MATCH THEN simple UNION thesaurus + ELSE simple +END; +\dF+ thesaurus_tst +SELECT to_tsvector('thesaurus_tst', 'one two'); +SELECT to_tsvector('thesaurus_tst', 'one two three'); +SELECT to_tsvector('thesaurus_tst', 'one two four'); + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR asciiword WITH CASE + thesaurus WHEN NO MATCH THEN simple ELSE thesaurus +END; +\dF+ thesaurus_tst +SELECT to_tsvector('thesaurus_tst', 'one two'); +SELECT to_tsvector('thesaurus_tst', 'one two three'); +SELECT to_tsvector('thesaurus_tst', 'one two books'); + +ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING + REPLACE simple WITH english_stem; +SELECT to_tsvector('thesaurus_tst', 'one two'); +SELECT to_tsvector('thesaurus_tst', 'one two three'); +SELECT to_tsvector('thesaurus_tst', 'one two books'); + +CREATE TEXT SEARCH CONFIGURATION operators_tst ( + COPY=thesaurus_tst +); + +ALTER TEXT SEARCH CONFIGURATION operators_tst ALTER MAPPING FOR asciiword WITH english_stem UNION simple; +SELECT to_tsvector('operators_tst', 'The Mysterious Rings of Supernova 1987A'); + +ALTER TEXT SEARCH CONFIGURATION operators_tst ALTER MAPPING FOR asciiword WITH english_stem UNION (synonym, simple); +SELECT to_tsvector('operators_tst', 'The Mysterious Rings of Supernova 1987A Postgres'); + -- invalid: non-lowercase quoted identifiers CREATE TEXT SEARCH DICTIONARY tsdict_case ( diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index 1c8520b3e9..6f8af63c1a 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -26,9 +26,9 @@ SELECT oid, cfgname FROM pg_ts_config WHERE cfgnamespace = 0 OR cfgowner = 0 OR cfgparser = 0; -SELECT mapcfg, maptokentype, mapseqno +SELECT mapcfg, maptokentype FROM pg_ts_config_map -WHERE mapcfg = 0 OR mapdict = 0; +WHERE mapcfg = 0; -- Look for pg_ts_config_map entries that aren't one of parser's token types SELECT * FROM