Fix assorted bugs in contrib/unaccent's configuration file parsing.
author: Tom Lane <[email protected]>
Mon, 7 Nov 2011 16:48:53 +0000 (11:48 -0500)
committer: Tom Lane <[email protected]>
Mon, 7 Nov 2011 16:50:18 +0000 (11:50 -0500)
Make it use t_isspace() to identify whitespace, rather than relying on
sscanf which is known to get it wrong on some platform/locale combinations.
Get rid of fixed-size buffers.  Make it actually continue to parse the file
after ignoring a line with untranslatable characters, as was obviously
intended.

The first of these issues is per gripe from J Smith, though not exactly
either of his proposed patches.

contrib/unaccent/unaccent.c

index d9c2eac2e74833c666939d4b74db183dbb5863be..d22f5c7beaa875451539b14983abe2f142b619eb 100644 (file)
@@ -91,35 +91,83 @@ initSuffixTree(char *filename)
 
    do
    {
-       char        src[4096];
-       char        trg[4096];
-       int         srclen;
-       int         trglen;
-       char       *line = NULL;
-
+       /*
+        * pg_do_encoding_conversion() (called by tsearch_readline()) will
+        * emit exception if it finds untranslatable characters in current
+        * locale. We just skip such lines, continuing with the next.
+        */
        skip = true;
 
        PG_TRY();
        {
-           /*
-            * pg_do_encoding_conversion() (called by tsearch_readline()) will
-            * emit exception if it finds untranslatable characters in current
-            * locale. We just skip such characters.
-            */
+           char       *line;
+
            while ((line = tsearch_readline(&trst)) != NULL)
            {
-               if (sscanf(line, "%s\t%s\n", src, trg) != 2)
-                   continue;
+               /*
+                * The format of each line must be "src trg" where src and trg
+                * are sequences of one or more non-whitespace characters,
+                * separated by whitespace.  Whitespace at start or end of
+                * line is ignored.
+                */
+               int         state;
+               char       *ptr;
+               char       *src = NULL;
+               char       *trg = NULL;
+               int         ptrlen;
+               int         srclen = 0;
+               int         trglen = 0;
+
+               state = 0;
+               for (ptr = line; *ptr; ptr += ptrlen)
+               {
+                   ptrlen = pg_mblen(ptr);
+                   /* ignore whitespace, but end src or trg */
+                   if (t_isspace(ptr))
+                   {
+                       if (state == 1)
+                           state = 2;
+                       else if (state == 3)
+                           state = 4;
+                       continue;
+                   }
+                   switch (state)
+                   {
+                       case 0:
+                           /* start of src */
+                           src = ptr;
+                           srclen = ptrlen;
+                           state = 1;
+                           break;
+                       case 1:
+                           /* continue src */
+                           srclen += ptrlen;
+                           break;
+                       case 2:
+                           /* start of trg */
+                           trg = ptr;
+                           trglen = ptrlen;
+                           state = 3;
+                           break;
+                       case 3:
+                           /* continue trg */
+                           trglen += ptrlen;
+                           break;
+                       default:
+                           /* bogus line format */
+                           state = -1;
+                           break;
+                   }
+               }
 
-               srclen = strlen(src);
-               trglen = strlen(trg);
+               if (state >= 3)
+                   rootSuffixTree = placeChar(rootSuffixTree,
+                                              (unsigned char *) src, srclen,
+                                              trg, trglen);
 
-               rootSuffixTree = placeChar(rootSuffixTree,
-                                          (unsigned char *) src, srclen,
-                                          trg, trglen);
-               skip = false;
                pfree(line);
            }
+           skip = false;
        }
        PG_CATCH();
        {