#!/usr/bin/env python3
#
# fontconfig/fc-case/fc-case.py
#
# Copyright © 2004 Keith Packard
# Copyright © 2019 Tim-Philipp Müller
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation, and that the name of the author(s) not be used in
# advertising or publicity pertaining to distribution of the software without
# specific, written prior permission. The authors make no
# representations about the suitability of this software for any purpose. It
# is provided "as is" without express or implied warranty.
#
# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
from collections import namedtuple
from enum import Enum
import argparse
import contextlib
import string
import sys


class CaseFoldClass(Enum):
    """Folding classes, keyed by the single-letter codes in caseFoldClassMap."""
    COMMON = 1
    FULL = 2
    SIMPLE = 3
    TURKIC = 4


class CaseFoldMethod(Enum):
    """Kind of entry emitted into the generated fcCaseFold table."""
    RANGE = 0
    EVEN_ODD = 1
    FULL = 2


# Maps the class column of a case folding line to its CaseFoldClass.
caseFoldClassMap = {
    'C': CaseFoldClass.COMMON,
    'F': CaseFoldClass.FULL,
    'S': CaseFoldClass.SIMPLE,
    'T': CaseFoldClass.TURKIC,
}

# Fold entries collected by the script; filled in by main().
folds = []


class CaseFoldingError(ValueError):
    """Raised when a case folding line does not have enough fields."""


# Everything parse_case_folding() extracts from the input:
#   folds          - list of fold dicts ('upper', 'method', 'count', 'offset')
#   fold_chars     - flat list of UTF-8 bytes referenced by FULL folds
#   min_fold_char  - first code point seen (None when no line was parsed)
#   max_fold_char  - last code point seen (None when no line was parsed)
#   max_fold_chars - largest byte count of any FULL fold
#   max_expand     - largest growth in UTF-8 bytes caused by any fold
CaseFoldTables = namedtuple(
    'CaseFoldTables',
    ['folds', 'fold_chars', 'min_fold_char', 'max_fold_char',
     'max_fold_chars', 'max_expand'])


def ucs4_to_utf8(ucs4):
    """Return the UTF-8 style byte sequence for *ucs4* as a list of ints.

    Sequences of up to six bytes are produced (values below 0x80000000);
    anything larger yields an empty list.
    """
    utf8_rep = []

    # Emit the lead byte and remember the shift of the first continuation
    # byte; -6 means "no continuation bytes".
    if ucs4 < 0x80:
        utf8_rep.append(ucs4)
        bits = -6
    elif ucs4 < 0x800:
        utf8_rep.append(((ucs4 >> 6) & 0x1F) | 0xC0)
        bits = 0
    elif ucs4 < 0x10000:
        utf8_rep.append(((ucs4 >> 12) & 0x0F) | 0xE0)
        bits = 6
    elif ucs4 < 0x200000:
        utf8_rep.append(((ucs4 >> 18) & 0x07) | 0xF0)
        bits = 12
    elif ucs4 < 0x4000000:
        utf8_rep.append(((ucs4 >> 24) & 0x03) | 0xF8)
        bits = 18
    elif ucs4 < 0x80000000:
        utf8_rep.append(((ucs4 >> 30) & 0x01) | 0xFC)
        bits = 24
    else:
        return []

    # Continuation bytes carry six payload bits each, most significant first.
    while bits >= 0:
        utf8_rep.append(((ucs4 >> bits) & 0x3F) | 0x80)
        bits -= 6

    return utf8_rep


def utf8_size(ucs4):
    """Return the number of bytes ucs4_to_utf8() produces for *ucs4*."""
    return len(ucs4_to_utf8(ucs4))


# Text printed for each method; the trailing comma is part of the table row.
case_fold_method_name_map = {
    CaseFoldMethod.RANGE: 'FC_CASE_FOLD_RANGE,',
    CaseFoldMethod.EVEN_ODD: 'FC_CASE_FOLD_EVEN_ODD,',
    CaseFoldMethod.FULL: 'FC_CASE_FOLD_FULL,',
}


def to_int16(offset):
    """Wrap *offset* into the signed 16-bit range [-32768, 32767].

    Values that already fit are returned unchanged; values up to 65536
    outside the range are wrapped around once.
    """
    if offset < -32768:
        offset += 65536
    if offset > 32767:
        offset -= 65536
    return offset


def parse_case_folding(casefile):
    """Build the fold tables from an iterable of case folding lines.

    Lines that do not start with a hex digit are skipped.  Every other line
    must contain at least three '; '-separated fields: the code point, the
    class letter and the space-separated list of result code points.

    Returns a CaseFoldTables tuple.
    Raises CaseFoldingError if a data line has fewer than three fields.
    """
    min_fold_char = None
    max_fold_char = None
    fold = None

    parsed_folds = []
    fold_chars = []
    max_fold_chars = 0
    max_expand = 0

    # Number lines from 1 so diagnostics match what an editor shows.
    for lineno, line in enumerate(casefile, 1):
        if not line or line[0] not in string.hexdigits:
            continue

        tokens = line.split('; ')
        if len(tokens) < 3:
            raise CaseFoldingError(
                'Not enough tokens in line {}'.format(lineno))

        # Upper case value, class, and list of result characters
        upper = int(tokens[0], 16)
        cfclass = caseFoldClassMap[tokens[1]]
        lower = [int(s, 16) for s in tokens[2].split()]

        # Compare against None: a falsy test would also match code point 0.
        if min_fold_char is None:
            min_fold_char = upper
        max_fold_char = upper

        if cfclass not in (CaseFoldClass.COMMON, CaseFoldClass.FULL):
            continue

        if len(lower) == 1:
            delta = lower[0] - upper

            # Does this mapping continue the run described by the last fold?
            if fold and fold['method'] == CaseFoldMethod.RANGE:
                fold_extends = (delta == fold['offset'] and
                                upper == fold['upper'] + fold['count'])
            elif fold and fold['method'] == CaseFoldMethod.EVEN_ODD:
                fold_extends = (delta == 1 and
                                upper == fold['upper'] + fold['count'] + 1)
            else:
                fold_extends = False

            if fold_extends:
                # fold is the last item of parsed_folds, so this updates it
                fold['count'] = upper - fold['upper'] + 1
            else:
                fold = {
                    'upper': upper,
                    'offset': delta,
                    'method': (CaseFoldMethod.EVEN_ODD if delta == 1
                               else CaseFoldMethod.RANGE),
                    'count': 1,
                }
                parsed_folds.append(fold)

            expand = utf8_size(lower[0]) - utf8_size(upper)
        else:
            # Multi-character result: store its UTF-8 bytes out of line and
            # record where they start and how many there are.
            fold = {
                'upper': upper,
                'method': CaseFoldMethod.FULL,
                'offset': len(fold_chars),
            }
            for c in lower:
                fold_chars.extend(ucs4_to_utf8(c))
            fold['count'] = len(fold_chars) - fold['offset']
            parsed_folds.append(fold)

            if fold['count'] > max_fold_chars:
                max_fold_chars = fold['count']

            expand = fold['count'] - utf8_size(upper)

        # Both branches above set expand; track the worst case.
        if expand > max_expand:
            max_expand = expand

    return CaseFoldTables(parsed_folds, fold_chars, min_fold_char,
                          max_fold_char, max_fold_chars, max_expand)


def emit_tables(tables, tmpl_file):
    """Print the template with the generated tables in place of '@@@'.

    *tables* is a CaseFoldTables tuple with non-None min/max fold chars and
    *tmpl_file* an iterable of template lines.  Output goes to sys.stdout.
    """
    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    #        the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')

    # Dump these tables
    print('#define FC_NUM_CASE_FOLD\t{}'.format(len(tables.folds)))
    print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(tables.fold_chars)))
    print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(tables.max_fold_chars))
    print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(tables.max_expand))
    print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(tables.min_fold_char))
    print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(tables.max_fold_char))
    print('')

    # Dump out ranges
    print('static const FcCaseFold fcCaseFold[FC_NUM_CASE_FOLD] = {')
    for f in tables.folds:
        print(' {} 0x{:08x}, {:22s} 0x{:04x}, {:6d} {},'.format(
            '{', f['upper'], case_fold_method_name_map[f['method']],
            f['count'], to_int16(f['offset']), '}'))
    print('};\n')

    # Dump out "other" values, sixteen per row, no comma after the last one
    print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {')
    last = len(tables.fold_chars) - 1
    for n, c in enumerate(tables.fold_chars):
        if n == last:
            end = ''
        elif n % 16 == 15:
            end = ',\n'
        else:
            end = ','
        print('0x{:02x}'.format(c), end=end)
    print('\n};')

    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')


def main():
    """Parse the command line, read the case folding file, write the output."""
    parser = argparse.ArgumentParser()
    parser.add_argument('case_folding_file')
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)
    args = parser.parse_args()

    # Read the standard Unicode CaseFolding.txt file
    try:
        with open(args.case_folding_file, 'r', encoding='utf-8') as casefile:
            tables = parse_case_folding(casefile)
    except CaseFoldingError as err:
        print(err, file=sys.stderr)
        sys.exit(1)

    # Bail out before touching the output file if there is nothing to emit.
    if tables.min_fold_char is None:
        print('No case folding data found in {}'.format(
            args.case_folding_file), file=sys.stderr)
        sys.exit(1)

    folds.extend(tables.folds)

    # The ExitStack closes whatever was opened, even if emitting fails.
    with contextlib.ExitStack() as stack:
        # Open output file
        if args.output_file:
            out = stack.enter_context(
                open(args.output_file, 'w', encoding='utf-8'))
            stack.enter_context(contextlib.redirect_stdout(out))

        # Read the template file
        if args.template_file:
            tmpl_file = stack.enter_context(
                open(args.template_file, 'r', encoding='utf-8'))
        else:
            tmpl_file = sys.stdin

        emit_tables(tables, tmpl_file)
        sys.stdout.flush()


if __name__ == '__main__':
    main()