33 import xml.etree.ElementTree
as ET
35 sys.stdout = codecs.getwriter(
'utf8')(sys.stdout.buffer)
41 PLAIN_LETTER_RANGES = ((ord(
'a'), ord(
'z')),
54 COMBINING_MARK_RANGES = ((0x0300, 0x0362),
63 if (
' ' in letter)
or (
'"' in letter):
64 letter =
'"' + letter.replace(
'"',
'""') +
'"'
65 output = chr(codepoint) +
"\t" + letter
67 output = chr(codepoint)
73 def __init__(self, id, general_category, combining_ids):
80 """Return true if this is a combining mark to remove."""
84 for begin, end
in COMBINING_MARK_RANGES:
85 if codepoint.id >= begin
and codepoint.id <= end:
91 """Return true if codepoint represents a "plain letter"."""
92 for begin, end
in PLAIN_LETTER_RANGES:
93 if codepoint.id >= begin
and codepoint.id <= end:
99 """Returns true for diacritical marks (combining codepoints)."""
100 return codepoint.general_category
in (
"Mn",
"Me",
"Mc")
104 """Returns true for letters combined with one or more marks."""
109 if len(codepoint.combining_ids) == 1:
113 if any(
is_mark(table[i])
for i
in codepoint.combining_ids[1:])
is False:
117 codepoint_base = codepoint.combining_ids[0]
126 """Return true for letter with or without diacritical marks."""
131 """Return the base codepoint without marks. If this codepoint has more
132 than one combining character, do a recursive lookup on the table to
133 find out its plain base letter."""
135 if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
138 return table[codepoint.combining_ids[0]]
141 assert False,
'Codepoint U+%0.2X' % codepoint.id
146 assert False,
'Codepoint U+%0.2X' % codepoint.id
150 """Return true for letters combined with letters."""
151 return all(
is_letter(table[i], table)
for i
in codepoint.combining_ids)
155 """Return a list of plain letters from a ligature."""
157 return [
get_plain_letter(table[id], table)
for id
in codepoint.combining_ids]
161 """Parse the XML file and return a set of tuples (src, trg), where "src"
162 is the original character and "trg" the substitute."""
163 charactersSet = set()
166 rulePattern = re.compile(
r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
169 transliterationTree = ET.parse(latinAsciiFilePath)
170 transliterationTreeRoot = transliterationTree.getroot()
175 blockRules = transliterationTreeRoot.findall(
"./transforms/transform/tRule")
179 rules = blockRules[0].text.splitlines()
183 matches = rulePattern.search(rule)
193 if matches
is not None:
194 src = matches.group(1)
if matches.group(1)
is not None else bytes(matches.group(2),
'UTF-8').decode(
'unicode-escape')
195 trg = matches.group(3)
if matches.group(3)
is not None else matches.group(4)
198 trg = trg.replace(
"\\'",
"'").replace(
'\\"',
'"')
202 if not src.isspace()
and not trg.isspace():
203 charactersSet.add((ord(src), trg))
209 """Returns the special cases which are not handled by other methods"""
210 charactersSet = set()
213 charactersSet.add((0x0401,
"\u0415"))
214 charactersSet.add((0x0451,
"\u0435"))
217 charactersSet.add((0x2103,
"\xb0C"))
218 charactersSet.add((0x2109,
"\xb0F"))
225 decomposition_type_pattern = re.compile(
" *<[^>]*> *")
231 charactersSet = set()
235 args.unicodeDataFilePath, mode=
'r', encoding=
'UTF-8',
236 )
as unicodeDataFile:
238 for line
in unicodeDataFile:
239 fields = line.split(
";")
242 general_category = fields[2]
243 decomposition = fields[5]
244 decomposition = re.sub(decomposition_type_pattern,
' ', decomposition)
245 id = int(fields[0], 16)
246 combining_ids = [int(s, 16)
for s
in decomposition.split(
" ")
if s !=
""]
247 codepoint =
Codepoint(id, general_category, combining_ids)
248 table[id] = codepoint
249 all.append(codepoint)
252 for codepoint
in all:
253 if codepoint.general_category.startswith(
'L')
and \
254 len(codepoint.combining_ids) > 1:
256 charactersSet.add((codepoint.id,
258 elif args.noLigaturesExpansion
is False and is_ligature(codepoint, table):
259 charactersSet.add((codepoint.id,
260 "".join(chr(combining_codepoint.id)
261 for combining_codepoint
264 charactersSet.add((codepoint.id,
None))
267 if not args.noLigaturesExpansion:
272 charactersList = sorted(charactersSet, key=
lambda characterPair: characterPair[0])
274 for characterPair
in charactersList:
278 if __name__ ==
"__main__":
279 parser = argparse.ArgumentParser(description=
'This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
280 parser.add_argument(
"--unicode-data-file", help=
"Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=
True, dest=
'unicodeDataFilePath')
281 parser.add_argument(
"--latin-ascii-file", help=
"Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest=
'latinAsciiFilePath')
282 parser.add_argument(
"--no-ligatures-expansion", help=
"Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action=
"store_true", dest=
'noLigaturesExpansion')
283 args = parser.parse_args()
285 if args.noLigaturesExpansion
is False and args.latinAsciiFilePath
is None:
286 sys.stderr.write(
'You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
void print(const void *obj)
def __init__(self, id, general_category, combining_ids)
def get_plain_letters(codepoint, table)
def print_record(codepoint, letter)
def is_plain_letter(codepoint)
def get_plain_letter(codepoint, table)
def is_letter_with_marks(codepoint, table)
def is_ligature(codepoint, table)
def is_letter(codepoint, table)
def is_mark_to_remove(codepoint)
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath)