Add combining characters to unaccent.rules.

author Thomas Munro <[email protected]>

Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)

committer Thomas Munro <[email protected]>

Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)
author Thomas Munro <[email protected]>
Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)
committer Thomas Munro <[email protected]>
Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out

index 69c2cf9bd7ab0af4656c20de86c9f00e09f6c478..c1bd7cd897df05eb0914c015e2d71ae230655783 100644 (file)
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
   >+-~
  (1 row)
  
+SELECT unaccent('À');  -- Remove combining diacritical 0x0300
+ unaccent 
+----------
+ A
+(1 row)
+
  SELECT unaccent('unaccent', 'foobar');
   unaccent 
  ----------
@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
   >+-~
  (1 row)
  
+SELECT unaccent('unaccent', 'À');
+ unaccent 
+----------
+ A
+(1 row)
+
  SELECT ts_lexize('unaccent', 'foobar');
   ts_lexize 
  -----------
@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
   {>+-~}
  (1 row)
  
+SELECT ts_lexize('unaccent', 'À');
+ ts_lexize 
+-----------
+ {A}
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py

index 4419a771edf919d20a317a9383c5f6ff3fa9b0ff..58b6e7deb74c4dc62e5cd331dd036eb996e6a7f0 100644 (file)
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
                         (0x03b1, 0x03c9),     # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
                         (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
  
+# Combining marks follow a "base" character, and result in a composite
+# character. Example: "U&'A\0300'" produces "À". There are three types of
+# combining marks: enclosing (Me), non-spacing combining (Mn), and spacing
+# combining (Mc). We identify the ranges of marks we feel safe removing.
+# References:
+#   https://en.wikipedia.org/wiki/Combining_character
+#   https://www.unicode.org/charts/PDF/U0300.pdf
+#   https://www.unicode.org/charts/PDF/U20D0.pdf
+COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
+                         (0x20dd, 0x20E0),  # Me: Symbols
+                         (0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
+
  def print_record(codepoint, letter):
-    print (chr(codepoint) + "\t" + letter)
+    if letter:
+        output = chr(codepoint) + "\t" + letter
+    else:
+        output = chr(codepoint)
+
+    print(output)
  
  class Codepoint:
      def __init__(self, id, general_category, combining_ids):
@@ -70,6 +87,16 @@ class Codepoint:
          self.general_category = general_category
          self.combining_ids = combining_ids
  
+def is_mark_to_remove(codepoint):
+    """Return true if this is a combining mark to remove."""
+    if not is_mark(codepoint):
+        return False
+
+    for begin, end in COMBINING_MARK_RANGES:
+        if codepoint.id >= begin and codepoint.id <= end:
+            return True
+    return False
+
  def is_plain_letter(codepoint):
      """Return true if codepoint represents a "plain letter"."""
      for begin, end in PLAIN_LETTER_RANGES:
@@ -234,6 +261,8 @@ def main(args):
                               "".join(chr(combining_codepoint.id)
                                       for combining_codepoint \
                                       in get_plain_letters(codepoint, table))))
+        elif is_mark_to_remove(codepoint):
+            charactersSet.add((codepoint.id, None))
  
      # add CLDR Latin-ASCII characters
      if not args.noLigaturesExpansion:
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql

index c671827caa55a634bfa9aa5752108433636b1a7e..2ae097ff2b86171b2255b30d44ee39bdd9f1f66c 100644 (file)
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -9,13 +9,16 @@ SELECT unaccent('foobar');
  SELECT unaccent('ёлка');
  SELECT unaccent('ЁЖИК');
  SELECT unaccent('˃˖˗˜');
+SELECT unaccent('À');  -- Remove combining diacritical 0x0300
  
  SELECT unaccent('unaccent', 'foobar');
  SELECT unaccent('unaccent', 'ёлка');
  SELECT unaccent('unaccent', 'ЁЖИК');
  SELECT unaccent('unaccent', '˃˖˗˜');
+SELECT unaccent('unaccent', 'À');
  
  SELECT ts_lexize('unaccent', 'foobar');
  SELECT ts_lexize('unaccent', 'ёлка');
  SELECT ts_lexize('unaccent', 'ЁЖИК');
  SELECT ts_lexize('unaccent', '˃˖˗˜');
+SELECT ts_lexize('unaccent', 'À');
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules

index 7ce25eef03d61668e0d631ed0cb1d4391fc402a9..99826408ac14560e4970a46f4e424c819cab158b 100644 (file)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -414,6 +414,105 @@
  ˖ +
  ˗ -
  ˜ ~
+̀
+́
+̂
+̃
+̄
+̅
+̆
+̇
+̈
+̉
+̊
+̋
+̌
+̍
+̎
+̏
+̐
+̑
+̒
+̓
+̔
+̕
+̖
+̗
+̘
+̙
+̚
+̛
+̜
+̝
+̞
+̟
+̠
+̡
+̢
+̣
+̤
+̥
+̦
+̧
+̨
+̩
+̪
+̫
+̬
+̭
+̮
+̯
+̰
+̱
+̲
+̳
+̴
+̵
+̶
+̷
+̸
+̹
+̺
+̻
+̼
+̽
+̾
+̿
+̀
+́
+͂
+̓
+̈́
+ͅ
+͆
+͇
+͈
+͉
+͊
+͋
+͌
+͍
+͎
+͏
+͐
+͑
+͒
+͓
+͔
+͕
+͖
+͗
+͘
+͙
+͚
+͛
+͜
+͝
+͞
+͟
+͠
+͡
+͢
  Ά Α
  Έ Ε
  Ή Η
@@ -982,6 +1081,13 @@
  ₧    Pts
  ₹    Rs
  ₺    TL
+⃝
+⃞
+⃟
+⃠
+⃢
+⃣
+⃤
  ℀    a/c
  ℁    a/s
  ℂ    C
author	Thomas Munro <[email protected]>
	Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)
committer	Thomas Munro <[email protected]>
	Fri, 1 Feb 2019 14:23:01 +0000 (15:23 +0100)
contrib/unaccent/expected/unaccent.out		patch \| blob \| blame \| history
contrib/unaccent/generate_unaccent_rules.py		patch \| blob \| blame \| history
contrib/unaccent/sql/unaccent.sql		patch \| blob \| blame \| history
contrib/unaccent/unaccent.rules		patch \| blob \| blame \| history