A
(1 row)
+SELECT unaccent('℃℉'); -- degree signs
+ unaccent
+----------
+ °C°F
+(1 row)
+
+SELECT unaccent('℗'); -- sound recording copyright
+ unaccent
+----------
+ (P)
+(1 row)
+
SELECT unaccent('unaccent', 'foobar');
unaccent
----------
A
(1 row)
+SELECT unaccent('unaccent', '℃℉');
+ unaccent
+----------
+ °C°F
+(1 row)
+
+SELECT unaccent('unaccent', '℗');
+ unaccent
+----------
+ (P)
+(1 row)
+
SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
{A}
(1 row)
+SELECT ts_lexize('unaccent', '℃℉');
+ ts_lexize
+-----------
+ {°C°F}
+(1 row)
+
+SELECT ts_lexize('unaccent', '℗');
+ ts_lexize
+-----------
+ {(P)}
+(1 row)
+
+-- Controversial case. Black-Letter Capital H (U+210C) is translated by
+-- Latin-ASCII.xml as 'x', but it should be 'H'.
+SELECT unaccent('ℌ');
+ unaccent
+----------
+ x
+(1 row)
+
return table[codepoint.combining_ids[0]]
# Should not come here
- assert(False)
+ assert False, 'Codepoint U+%0.2X' % codepoint.id
elif is_plain_letter(codepoint):
return codepoint
# Should not come here
- assert(False)
+ assert False, 'Codepoint U+%0.2X' % codepoint.id
def is_ligature(codepoint, table):
# Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
charactersSet.add((0x2103, "\xb0C")) # DEGREE CELSIUS
charactersSet.add((0x2109, "\xb0F")) # DEGREE FAHRENHEIT
- charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
return charactersSet
SELECT unaccent('ЁЖИК');
SELECT unaccent('˃˖˗˜');
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
+SELECT unaccent('℃℉'); -- degree signs
+SELECT unaccent('℗'); -- sound recording copyright
SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК');
SELECT unaccent('unaccent', '˃˖˗˜');
SELECT unaccent('unaccent', 'À');
+SELECT unaccent('unaccent', '℃℉');
+SELECT unaccent('unaccent', '℗');
SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК');
SELECT ts_lexize('unaccent', '˃˖˗˜');
SELECT ts_lexize('unaccent', 'À');
+SELECT ts_lexize('unaccent', '℃℉');
+SELECT ts_lexize('unaccent', '℗');
+
+-- Controversial case. Black-Letter Capital H (U+210C) is translated by
+-- Latin-ASCII.xml as 'x', but it should be 'H'.
+SELECT unaccent('ℌ');