Skip to content

Commit 5e8d670

Browse files
committed
Add Greek characters to unaccent.rules.
Author: Tasos Maschalidis
Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/153495048900.1368.11566580687623014380%40wrigleys.postgresql.org
Discussion: https://postgr.es/m/VI1PR01MB38537EBD529FE5EE3FE9A5FEB5370%40VI1PR01MB3853.eurprd01.prod.exchangelabs.com
1 parent ec74369 commit 5e8d670

File tree

2 files changed

+236
-4
lines changed

2 files changed

+236
-4
lines changed

contrib/unaccent/generate_unaccent_rules.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,15 @@
2929
import sys
3030
import xml.etree.ElementTree as ET
3131

32+
# The ranges of Unicode characters that we consider to be "plain letters".
33+
# For now we are being conservative by including only Latin and Greek. This
34+
# could be extended in future based on feedback from people with relevant
35+
# language knowledge.
36+
PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
37+
(ord('A'), ord('Z')), # Latin upper case
38+
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
39+
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
40+
3241
def print_record(codepoint, letter):
3342
print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
3443

@@ -39,9 +48,11 @@ def __init__(self, id, general_category, combining_ids):
3948
self.combining_ids = combining_ids
4049

4150
def is_plain_letter(codepoint):
42-
"""Return true if codepoint represents a plain ASCII letter."""
43-
return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
44-
(codepoint.id >= ord('A') and codepoint.id <= ord('Z'))
51+
"""Return true if codepoint represents a "plain letter"."""
52+
for begin, end in PLAIN_LETTER_RANGES:
53+
if codepoint.id >= begin and codepoint.id <= end:
54+
return True
55+
return False
4556

4657
def is_mark(codepoint):
4758
"""Returns true for diacritical marks (combining codepoints)."""
@@ -184,7 +195,7 @@ def main(args):
184195
len(codepoint.combining_ids) > 1:
185196
if is_letter_with_marks(codepoint, table):
186197
charactersSet.add((codepoint.id,
187-
chr(get_plain_letter(codepoint, table).id)))
198+
unichr(get_plain_letter(codepoint, table).id)))
188199
elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
189200
charactersSet.add((codepoint.id,
190201
"".join(unichr(combining_codepoint.id)

contrib/unaccent/unaccent.rules

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,26 @@
399399
ʦ ts
400400
ʪ ls
401401
ʫ lz
402+
Ά Α
403+
Έ Ε
404+
Ή Η
405+
Ί Ι
406+
Ό Ο
407+
Ύ Υ
408+
Ώ Ω
409+
ΐ ι
410+
Ϊ Ι
411+
Ϋ Υ
412+
ά α
413+
έ ε
414+
ή η
415+
ί ι
416+
ΰ υ
417+
ϊ ι
418+
ϋ υ
419+
ό ο
420+
ύ υ
421+
ώ ω
402422
Ё Е
403423
ё е
404424
ᴀ A
@@ -709,6 +729,207 @@
709729
ỽ v
710730
Ỿ Y
711731
ỿ y
732+
ἀ α
733+
ἁ α
734+
ἂ α
735+
ἃ α
736+
ἄ α
737+
ἅ α
738+
ἆ α
739+
ἇ α
740+
Ἀ Α
741+
Ἁ Α
742+
Ἂ Α
743+
Ἃ Α
744+
Ἄ Α
745+
Ἅ Α
746+
Ἆ Α
747+
Ἇ Α
748+
ἐ ε
749+
ἑ ε
750+
ἒ ε
751+
ἓ ε
752+
ἔ ε
753+
ἕ ε
754+
Ἐ Ε
755+
Ἑ Ε
756+
Ἒ Ε
757+
Ἓ Ε
758+
Ἔ Ε
759+
Ἕ Ε
760+
ἠ η
761+
ἡ η
762+
ἢ η
763+
ἣ η
764+
ἤ η
765+
ἥ η
766+
ἦ η
767+
ἧ η
768+
Ἠ Η
769+
Ἡ Η
770+
Ἢ Η
771+
Ἣ Η
772+
Ἤ Η
773+
Ἥ Η
774+
Ἦ Η
775+
Ἧ Η
776+
ἰ ι
777+
ἱ ι
778+
ἲ ι
779+
ἳ ι
780+
ἴ ι
781+
ἵ ι
782+
ἶ ι
783+
ἷ ι
784+
Ἰ Ι
785+
Ἱ Ι
786+
Ἲ Ι
787+
Ἳ Ι
788+
Ἴ Ι
789+
Ἵ Ι
790+
Ἶ Ι
791+
Ἷ Ι
792+
ὀ ο
793+
ὁ ο
794+
ὂ ο
795+
ὃ ο
796+
ὄ ο
797+
ὅ ο
798+
Ὀ Ο
799+
Ὁ Ο
800+
Ὂ Ο
801+
Ὃ Ο
802+
Ὄ Ο
803+
Ὅ Ο
804+
ὐ υ
805+
ὑ υ
806+
ὒ υ
807+
ὓ υ
808+
ὔ υ
809+
ὕ υ
810+
ὖ υ
811+
ὗ υ
812+
Ὑ Υ
813+
Ὓ Υ
814+
Ὕ Υ
815+
Ὗ Υ
816+
ὠ ω
817+
ὡ ω
818+
ὢ ω
819+
ὣ ω
820+
ὤ ω
821+
ὥ ω
822+
ὦ ω
823+
ὧ ω
824+
Ὠ Ω
825+
Ὡ Ω
826+
Ὢ Ω
827+
Ὣ Ω
828+
Ὤ Ω
829+
Ὥ Ω
830+
Ὦ Ω
831+
Ὧ Ω
832+
ὰ α
833+
ὲ ε
834+
ὴ η
835+
ὶ ι
836+
ὸ ο
837+
ὺ υ
838+
ὼ ω
839+
ᾀ α
840+
ᾁ α
841+
ᾂ α
842+
ᾃ α
843+
ᾄ α
844+
ᾅ α
845+
ᾆ α
846+
ᾇ α
847+
ᾈ Α
848+
ᾉ Α
849+
ᾊ Α
850+
ᾋ Α
851+
ᾌ Α
852+
ᾍ Α
853+
ᾎ Α
854+
ᾏ Α
855+
ᾐ η
856+
ᾑ η
857+
ᾒ η
858+
ᾓ η
859+
ᾔ η
860+
ᾕ η
861+
ᾖ η
862+
ᾗ η
863+
ᾘ Η
864+
ᾙ Η
865+
ᾚ Η
866+
ᾛ Η
867+
ᾜ Η
868+
ᾝ Η
869+
ᾞ Η
870+
ᾟ Η
871+
ᾠ ω
872+
ᾡ ω
873+
ᾢ ω
874+
ᾣ ω
875+
ᾤ ω
876+
ᾥ ω
877+
ᾦ ω
878+
ᾧ ω
879+
ᾨ Ω
880+
ᾩ Ω
881+
ᾪ Ω
882+
ᾫ Ω
883+
ᾬ Ω
884+
ᾭ Ω
885+
ᾮ Ω
886+
ᾯ Ω
887+
ᾰ α
888+
ᾱ α
889+
ᾲ α
890+
ᾳ α
891+
ᾴ α
892+
ᾶ α
893+
ᾷ α
894+
Ᾰ Α
895+
Ᾱ Α
896+
Ὰ Α
897+
ᾼ Α
898+
ῂ η
899+
ῃ η
900+
ῄ η
901+
ῆ η
902+
ῇ η
903+
Ὲ Ε
904+
Ὴ Η
905+
ῌ Η
906+
ῐ ι
907+
ῑ ι
908+
ῒ ι
909+
ῖ ι
910+
ῗ ι
911+
Ῐ Ι
912+
Ῑ Ι
913+
Ὶ Ι
914+
ῠ υ
915+
ῡ υ
916+
ῢ υ
917+
ῤ ρ
918+
ῥ ρ
919+
ῦ υ
920+
ῧ υ
921+
Ῠ Υ
922+
Ῡ Υ
923+
Ὺ Υ
924+
Ῥ Ρ
925+
ῲ ω
926+
ῳ ω
927+
ῴ ω
928+
ῶ ω
929+
ῷ ω
930+
Ὸ Ο
931+
Ὼ Ω
932+
ῼ Ω
712933
‐ -
713934
‑ -
714935
‒ -

0 commit comments

Comments
 (0)