41PLAIN_LETTER_RANGES = ((
ord(
'a'),
ord(
'z')),
54COMBINING_MARK_RANGES = ((0x0300, 0x0362),
63 if (
' ' in letter)
or (
'"' in letter):
65 output = chr(codepoint) +
"\t" + letter
67 output = chr(codepoint)
73 def __init__(self, id, general_category, combining_ids):
80 """Return true if this is a combining mark to remove."""
84 for begin, end
in COMBINING_MARK_RANGES:
91 """Return true if codepoint represents a "plain letter"."""
92 for begin, end
in PLAIN_LETTER_RANGES:
99 """Returns true for diacritical marks (combining codepoints)."""
104 """Returns true for letters combined with one or more marks."""
127 """Return true for letter with or without diacritical marks."""
132 """Return the base codepoint without marks. If this codepoint has more
133 than one combining character, do a recursive lookup on the table to
134 find out its plain base letter."""
151 """Return true for letters combined with letters."""
155 """Return a list of plain letters from a ligature."""
161 """Parse the XML file and return a set of tuples (src, trg), where "src"
162 is the original character and "trg" the substitute."""
163 charactersSet = set()
166 rulePattern =
re.compile(
r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
169 transliterationTree =
ET.parse(latinAsciiFilePath)
193 if matches
is not None:
214 """Returns the special cases which are not handled by other methods"""
215 charactersSet = set()
230 decomposition_type_pattern =
re.compile(
" *<[^>]*> *")
236 charactersSet = set()
241 )
as unicodeDataFile:
243 for line
in unicodeDataFile:
247 general_category = fields[2]
248 decomposition = fields[5]
249 decomposition =
re.sub(decomposition_type_pattern,
' ', decomposition)
250 id =
int(fields[0], 16)
252 codepoint =
Codepoint(id, general_category, combining_ids)
253 table[id] = codepoint
257 for codepoint
in all:
266 for combining_codepoint
277 charactersList = sorted(charactersSet, key=
lambda characterPair: characterPair[0])
279 for characterPair
in charactersList:
283if __name__ ==
"__main__":
284 parser =
argparse.ArgumentParser(description=
'This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
285 parser.add_argument(
"--unicode-data-file", help=
"Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=
True, dest=
'unicodeDataFilePath')
286 parser.add_argument(
"--latin-ascii-file", help=
"Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest=
'latinAsciiFilePath')
287 parser.add_argument(
"--no-ligatures-expansion", help=
"Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action=
"store_true", dest=
'noLigaturesExpansion')
291 sys.stderr.write(
'You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
void print(const void *obj)
__init__(self, id, general_category, combining_ids)
is_ligature(codepoint, table)
is_mark_to_remove(codepoint)
is_plain_letter(codepoint)
is_letter_with_marks(codepoint, table)
is_letter(codepoint, table)
parse_cldr_latin_ascii_transliterator(latinAsciiFilePath)
print_record(codepoint, letter)
get_plain_letter(codepoint, table)
get_plain_letters(codepoint, table)