33 import xml.etree.ElementTree
as ET
35 sys.stdout = codecs.getwriter(
'utf8')(sys.stdout.buffer)
41 PLAIN_LETTER_RANGES = ((ord(
'a'), ord(
'z')),
54 COMBINING_MARK_RANGES = ((0x0300, 0x0362),
61 output = chr(codepoint) +
"\t" + letter
63 output = chr(codepoint)
69 def __init__(self, id, general_category, combining_ids):
76 """Return true if this is a combining mark to remove."""
80 for begin, end
in COMBINING_MARK_RANGES:
81 if codepoint.id >= begin
and codepoint.id <= end:
87 """Return true if codepoint represents a "plain letter"."""
88 for begin, end
in PLAIN_LETTER_RANGES:
89 if codepoint.id >= begin
and codepoint.id <= end:
95 """Returns true for diacritical marks (combining codepoints)."""
96 return codepoint.general_category
in (
"Mn",
"Me",
"Mc")
100 """Returns true for letters combined with one or more marks."""
105 if len(codepoint.combining_ids) == 1:
109 if any(
is_mark(table[i])
for i
in codepoint.combining_ids[1:])
is False:
113 codepoint_base = codepoint.combining_ids[0]
122 """Return true for letter with or without diacritical marks."""
127 """Return the base codepoint without marks. If this codepoint has more
128 than one combining character, do a recursive lookup on the table to
129 find out its plain base letter."""
131 if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
134 return table[codepoint.combining_ids[0]]
137 assert False,
'Codepoint U+%0.2X' % codepoint.id
142 assert False,
'Codepoint U+%0.2X' % codepoint.id
146 """Return true for letters combined with letters."""
147 return all(
is_letter(table[i], table)
for i
in codepoint.combining_ids)
151 """Return a list of plain letters from a ligature."""
153 return [
get_plain_letter(table[id], table)
for id
in codepoint.combining_ids]
157 """Parse the XML file and return a set of tuples (src, trg), where "src"
158 is the original character and "trg" the substitute."""
159 charactersSet = set()
162 rulePattern = re.compile(
r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
165 transliterationTree = ET.parse(latinAsciiFilePath)
166 transliterationTreeRoot = transliterationTree.getroot()
171 blockRules = transliterationTreeRoot.findall(
"./transforms/transform/tRule")
175 rules = blockRules[0].text.splitlines()
179 matches = rulePattern.search(rule)
189 if matches
is not None:
190 src = matches.group(1)
if matches.group(1)
is not None else bytes(matches.group(2),
'UTF-8').decode(
'unicode-escape')
191 trg = matches.group(3)
if matches.group(3)
is not None else matches.group(4)
194 trg = trg.replace(
"\\'",
"'").replace(
'\\"',
'"')
198 if not src.isspace()
and not trg.isspace():
199 charactersSet.add((ord(src), trg))
205 """Returns the special cases which are not handled by other methods"""
206 charactersSet = set()
209 charactersSet.add((0x0401,
"\u0415"))
210 charactersSet.add((0x0451,
"\u0435"))
213 charactersSet.add((0x2103,
"\xb0C"))
214 charactersSet.add((0x2109,
"\xb0F"))
221 decomposition_type_pattern = re.compile(
" *<[^>]*> *")
227 charactersSet = set()
231 args.unicodeDataFilePath, mode=
'r', encoding=
'UTF-8',
232 )
as unicodeDataFile:
234 for line
in unicodeDataFile:
235 fields = line.split(
";")
238 general_category = fields[2]
239 decomposition = fields[5]
240 decomposition = re.sub(decomposition_type_pattern,
' ', decomposition)
241 id = int(fields[0], 16)
242 combining_ids = [int(s, 16)
for s
in decomposition.split(
" ")
if s !=
""]
243 codepoint =
Codepoint(id, general_category, combining_ids)
244 table[id] = codepoint
245 all.append(codepoint)
248 for codepoint
in all:
249 if codepoint.general_category.startswith(
'L')
and \
250 len(codepoint.combining_ids) > 1:
252 charactersSet.add((codepoint.id,
254 elif args.noLigaturesExpansion
is False and is_ligature(codepoint, table):
255 charactersSet.add((codepoint.id,
256 "".join(chr(combining_codepoint.id)
257 for combining_codepoint
260 charactersSet.add((codepoint.id,
None))
263 if not args.noLigaturesExpansion:
268 charactersList = sorted(charactersSet, key=
lambda characterPair: characterPair[0])
270 for characterPair
in charactersList:
274 if __name__ ==
"__main__":
275 parser = argparse.ArgumentParser(description=
'This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
276 parser.add_argument(
"--unicode-data-file", help=
"Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=
True, dest=
'unicodeDataFilePath')
277 parser.add_argument(
"--latin-ascii-file", help=
"Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest=
'latinAsciiFilePath')
278 parser.add_argument(
"--no-ligatures-expansion", help=
"Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action=
"store_true", dest=
'noLigaturesExpansion')
279 args = parser.parse_args()
281 if args.noLigaturesExpansion
is False and args.latinAsciiFilePath
is None:
282 sys.stderr.write(
'You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
void print(const void *obj)
def __init__(self, id, general_category, combining_ids)
def get_plain_letters(codepoint, table)
def print_record(codepoint, letter)
def is_plain_letter(codepoint)
def get_plain_letter(codepoint, table)
def is_letter_with_marks(codepoint, table)
def is_ligature(codepoint, table)
def is_letter(codepoint, table)
def is_mark_to_remove(codepoint)
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath)