From bb428135490caafe23562e3dcd9925d7d7c5a7be Mon Sep 17 00:00:00 2001 From: Jehan-Guillaume de Rorthais Date: Thu, 11 Jun 2020 18:08:31 +0200 Subject: [PATCH] Replace buggy ucol_strcoll* funcs with ucol_strcollIter Functions ucol_strcoll and ucol_strcollUTF8 are breaking some collation rules. This leads to wrong sort order or wrong result from index using such collations. See bug report #15285 for details: http://postgr.es/m/153201618542.1404.3611626898935613264%40wrigleys.postgresql.org This fix replace ucol_strcoll* functions with ucol_strcollIter() which is not affected by this bug. Reported-by: Roman Lytovchenko Analysed-by: Peter Geoghegan Author: Jehan-Guillaume de Rorthais --- src/backend/utils/adt/varlena.c | 54 ++++++++++++------- src/include/utils/pg_locale.h | 14 ----- .../regress/expected/collate.icu.utf8.out | 12 +++++ src/test/regress/sql/collate.icu.utf8.sql | 8 +++ 4 files changed, 56 insertions(+), 32 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 2eaabd6231..f156c00a1a 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1638,34 +1638,43 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) if (mylocale->provider == COLLPROVIDER_ICU) { #ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 if (GetDatabaseEncoding() == PG_UTF8) { UErrorCode status; + UCharIterator iter1, + iter2; status = U_ZERO_ERROR; - result = ucol_strcollUTF8(mylocale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); + + uiter_setUTF8(&iter1, arg1, len1); + uiter_setUTF8(&iter2, arg2, len2); + + result = ucol_strcollIter(mylocale->info.icu.ucol, + &iter1, &iter2, &status); if (U_FAILURE(status)) ereport(ERROR, (errmsg("collation failed: %s", u_errorName(status)))); } else -#endif { + UErrorCode status; + UCharIterator iter1, + iter2; int32_t ulen1, ulen2; UChar *uchar1, *uchar2; + status = U_ZERO_ERROR; + ulen1 = icu_to_uchar(&uchar1, arg1, len1); ulen2 = icu_to_uchar(&uchar2, arg2, len2); - result = ucol_strcoll(mylocale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); + uiter_setString(&iter1, uchar1, ulen1); + uiter_setString(&iter2, uchar2, ulen2); + + result = ucol_strcollIter(mylocale->info.icu.ucol, + &iter1, &iter2, &status); pfree(uchar1); pfree(uchar2); @@ -2352,34 +2361,43 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) if (sss->locale->provider == COLLPROVIDER_ICU) { #ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 if (GetDatabaseEncoding() == PG_UTF8) { UErrorCode status; + UCharIterator iter1, + iter2; status = U_ZERO_ERROR; - result = ucol_strcollUTF8(sss->locale->info.icu.ucol, - a1p, len1, - a2p, len2, - &status); + + uiter_setUTF8(&iter1, a1p, len1); + uiter_setUTF8(&iter2, a2p, len2); + + result = ucol_strcollIter(sss->locale->info.icu.ucol, + &iter1, &iter2, &status); if (U_FAILURE(status)) ereport(ERROR, (errmsg("collation failed: %s", u_errorName(status)))); } else -#endif { + UErrorCode status; + UCharIterator iter1, + iter2; int32_t ulen1, ulen2; UChar *uchar1, *uchar2; + status = U_ZERO_ERROR; + ulen1 = icu_to_uchar(&uchar1, a1p, len1); ulen2 = icu_to_uchar(&uchar2, a2p, len2); - result = ucol_strcoll(sss->locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); + uiter_setString(&iter1, uchar1, ulen1); + uiter_setString(&iter2, uchar2, ulen2); + + result = ucol_strcollIter(sss->locale->info.icu.ucol, + &iter1, &iter2, &status); pfree(uchar1); pfree(uchar2); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 9cb7d91ddf..2b3ec2c597 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -21,20 +21,6 @@ #include "utils/guc.h" -#ifdef USE_ICU -/* - * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53. - * (see - * ) - */ -#if U_ICU_VERSION_MAJOR_NUM >= 53 -#define HAVE_UCOL_STRCOLLUTF8 1 -#else -#undef HAVE_UCOL_STRCOLLUTF8 -#endif -#endif - - /* GUC settings */ extern char *locale_messages; extern char *locale_monetary; diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 2b86ce9028..06cdb948eb 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -1900,6 +1900,18 @@ SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1); t (1 row) +CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit'); +CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast); +INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD'); +CREATE INDEX ON test34(b); +SET enable_seqscan TO off; +SELECT * FROM test34 WHERE b = '0000' ; + b +------ + 0000 +(1 row) + +RESET enable_seqscan; -- cleanup RESET search_path; SET client_min_messages TO warning; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index 67de7d9794..50d0be70a8 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -722,6 +722,14 @@ INSERT INTO test33 VALUES (2, 'DEF'); SELECT (SELECT count(*) FROM test33_0) <> (SELECT count(*) FROM test33_1); +CREATE COLLATION digitslast (provider = icu, locale = '@colReorder=latn-digit'); +CREATE TABLE test34 (b CHAR(4) NOT NULL COLLATE digitslast); +INSERT INTO test34 VALUES ('0000'), ('0001'), ('ABCD'); +CREATE INDEX ON test34(b); +SET enable_seqscan TO off; +SELECT * FROM test34 WHERE b = '0000' ; +RESET enable_seqscan; + -- cleanup RESET search_path; SET client_min_messages TO warning; -- 2.20.1