#!/usr/bin/env python
#
# FreeBSD UTF-8 LC_CTYPE Generator
#
#  by Hye-Shik Chang
#
"""Generate UTF-8 LC_CTYPE definitions from the Unicode data files.

When run as a script, reads ``Blocks.txt`` and ``UnicodeData.txt`` from the
current directory and writes the generated definitions to standard output.
Only code points up to U+FFFF are processed.

The code runs unchanged on Python 2.6+ and Python 3.
"""

import functools
import re
import sys

# Character-class bit flags stored as the last field of every record.
ALPHA = 0x0001
CONTROL = 0x0002
DIGIT = 0x0004
GRAPH = 0x0008
LOWER = 0x0010
PUNCT = 0x0020
SPACE = 0x0040
UPPER = 0x0080
XDIGIT = 0x0100
BLANK = 0x0200
PRINT = 0x0400
IDEOGRAM = 0x0800
SPECIAL = 0x1000
PHONOGRAM = 0x2000

# (flag, keyword) pairs, in the order the keywords are written per block.
BSD_CTYPES = (
    (ALPHA, 'ALPHA '),
    (CONTROL, 'CONTROL '),
    (DIGIT, 'DIGIT '),
    (GRAPH, 'GRAPH '),
    (LOWER, 'LOWER '),
    (PUNCT, 'PUNCT '),
    (SPACE, 'SPACE '),
    (UPPER, 'UPPER '),
    (XDIGIT, 'XDIGIT '),
    (BLANK, 'BLANK '),
    (PRINT, 'PRINT '),
    (IDEOGRAM, 'IDEOGRAM '),
    (SPECIAL, 'SPECIAL '),
    (PHONOGRAM, 'PHONOGRAM'),
)

# Unicode General_Category value -> flags.
CLASSES = {
    '': 0,
    'C': CONTROL,  # Other
    'Cc': CONTROL,  # Control
    'Cf': CONTROL,  # Format
    'Cn': 0,  # Unassigned
    'Co': PRINT | GRAPH,  # Private_Use
    'Cs': PRINT,  # Surrogate
    'L': PRINT | GRAPH,  # Letter
    'LC': PRINT | GRAPH | ALPHA,  # Cased_Letter
    'Ll': PRINT | GRAPH | ALPHA | LOWER,  # Lowercase_Letter
    'Lm': PRINT | GRAPH,  # Modifier_Letter
    'Lo': PRINT | GRAPH,  # Other_Letter
    'Lt': PRINT | GRAPH | ALPHA,  # Titlecase_Letter
    'Lu': PRINT | GRAPH | ALPHA | UPPER,  # Uppercase_Letter
    'M': PRINT | GRAPH,  # Mark
    'Mc': PRINT | GRAPH,  # Spacing_Mark
    'Me': PRINT | GRAPH,  # Enclosing_Mark
    'Mn': PRINT | GRAPH,  # Nonspacing_Mark
    'N': PRINT | GRAPH | DIGIT,  # Number
    'Nd': PRINT | GRAPH | DIGIT,  # Decimal_Number
    'Nl': PRINT | GRAPH | SPECIAL,  # Letter_Number
    'No': PRINT | GRAPH | SPECIAL,  # Other_Number
    'P': PRINT | GRAPH | PUNCT,  # Punctuation
    'Pc': PRINT | GRAPH | PUNCT,  # Connector_Punctuation
    'Pd': PRINT | GRAPH | PUNCT,  # Dash_Punctuation
    'Pe': PRINT | GRAPH | PUNCT,  # Close_Punctuation
    'Pf': PRINT | GRAPH | PUNCT,  # Final_Punctuation
    'Pi': PRINT | GRAPH | PUNCT,  # Initial_Punctuation
    'Po': PRINT | GRAPH | PUNCT,  # Other_Punctuation
    'Ps': PRINT | GRAPH | PUNCT,  # Open_Punctuation
    'S': PRINT | GRAPH | PUNCT,  # Symbol
    'Sc': PRINT | GRAPH | PUNCT,  # Currency_Symbol
    'Sk': PRINT | GRAPH | PUNCT,  # Modifier_Symbol
    'Sm': PRINT | GRAPH | PUNCT,  # Math_Symbol
    'So': PRINT | GRAPH | PUNCT,  # Other_Symbol
    'Z': PRINT | SPACE,  # Separator
    'Zl': PRINT | SPACE,  # Line_Separator
    'Zp': PRINT | SPACE,  # Paragraph_Separator
    'Zs': PRINT | SPACE | BLANK,  # Space_Separator
}

# Unicode Bidi_Class value -> extra flags OR-ed into the category flags.
BIDIRECT_PROPS = {
    '': 0,
    'AL': 0,  # Arabic_Letter
    'AN': 0,  # Arabic_Number
    'B': SPACE,  # Paragraph_Separator
    'BN': 0,  # Boundary_Neutral
    'CS': 0,  # Common_Separator
    'EN': 0,  # European_Number
    'ES': 0,  # European_Separator
    'ET': 0,  # European_Terminator
    'L': 0,  # Left_To_Right
    'LRE': 0,  # Left_To_Right_Embedding
    'LRO': 0,  # Left_To_Right_Override
    'NSM': 0,  # Nonspacing_Mark
    'ON': 0,  # Other_Neutral
    'PDF': 0,  # Pop_Directional_Format
    'R': 0,  # Right_To_Left
    'RLE': 0,  # Right_To_Left_Embedding
    'RLO': 0,  # Right_To_Left_Override
    'S': BLANK | SPACE,  # Segment_Separator
    'WS': SPACE,  # White_Space
    # Isolate classes introduced in Unicode 6.3; without these entries a
    # current UnicodeData.txt raises KeyError.  Like the embedding and
    # override controls above, they contribute no flags.
    'LRI': 0,  # Left_To_Right_Isolate
    'RLI': 0,  # Right_To_Left_Isolate
    'FSI': 0,  # First_Strong_Isolate
    'PDI': 0,  # Pop_Directional_Isolate
}

# Code points whose flags are set to exactly this value instead of being
# derived from their category and bidi class.
EXCEPTIONS = {
    0x001c: CONTROL,
    0x001d: CONTROL,
    0x001e: CONTROL,
    0x001f: CONTROL,
}

# Name patterns used to refine the flags of individual characters.
XDIGITMASK = re.compile('LETTER [A-F]$')
IDEOGRAMMASK = re.compile('[Ii][Dd][Ee][Oo][Gg][Rr]')
PHONOGRAMMASK = re.compile('[Ss][Yy][Ll][Ll][Aa][Bb]|[Hh][Aa][Nn][Gg][Uu][Ll][ ][Ll][Ee][Tt][Tt]|[Hh][Ii][Rr][Aa][Gg][Aa][Nn][Aa]|[Kk][Aa][Tt][Aa][Kk][Aa][Nn][Aa]')

FILEHEADER = """\
/*
 * UTF-8 LC_CTYPE definitions
 *
 * Copyright (c) 2002 Hye-Shik Chang. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

ENCODING "UTF2"
VARIABLE UTF-8 Character Types"""

BLOCKHEADER = """
/*
 * %(area)s : %(name)s
 */
"""


def hex2int(x):
    """Return the integer value of the hexadecimal string *x*.

    Uses ``int(x, 16)`` rather than ``eval`` so that text read from the
    data files is never executed as code.
    """
    return int(x, 16)


def repr_code(c):
    """Return how code point *c* is written in the generated output.

    ASCII letters and digits are written as a quoted character ('A');
    every other code point is written as ``0x`` plus at least four
    lowercase hex digits.
    """
    # The range test comes first so chr() is only called on small values.
    if c < 0x7f and chr(c).isalnum():
        return "'%s'" % chr(c)
    return '0x%04x' % c


def repr_codearea(st, en):
    """Return the output form of the inclusive code range *st*..*en*.

    A single code is written alone, two adjacent codes are written side by
    side, and anything longer is written as ``first - last``.
    """
    if st == en:
        return repr_code(st)
    if abs(st - en) <= 1:
        return "%s %s" % (repr_code(st), repr_code(en))
    return "%s - %s" % (repr_code(st), repr_code(en))


def _emit(text=''):
    """Write *text* (converted with ``%s``) and a newline to stdout."""
    sys.stdout.write('%s\n' % (text,))


@functools.total_ordering
class CodeArea(object):
    """A single code point (``en`` is None) or an inclusive range st..en.

    Ordering, equality and hashing all use the start code only, so areas
    can be sorted and the hash stays consistent with equality.
    """

    def __init__(self, st, en=None):
        self.st = st
        self.en = en

    def isincode(self, c):
        """Return whether code point *c* lies inside this area."""
        if self.en is None:
            return c == self.st
        return self.st <= c <= self.en

    def __hash__(self):
        # Must agree with __eq__, which compares the start code only.
        return hash(self.st)

    def __eq__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self.st == o.st

    def __ne__(self, o):
        # Python 2 does not derive __ne__ from __eq__.
        result = self.__eq__(o)
        if result is NotImplemented:
            return result
        return not result

    def __lt__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self.st < o.st

    def __cmp__(self, o):
        # Kept for callers that invoke it directly; the cmp() builtin it
        # used to call does not exist on Python 3.
        return (self.st > o.st) - (self.st < o.st)

    def __repr__(self):
        if self.en is None:
            return 'U+%04X' % self.st
        return 'U+%04X - U+%04X' % (self.st, self.en)


class MapStack(object):
    """An ordered list of code-to-value mappings stored as runs.

    ``data`` holds ``[first, last, first_value, last_value]`` lists; ``tag``
    is the keyword written in front of every run by ``repr()``.
    """

    def __init__(self, tag):
        self.tag = tag
        self.data = []

    def __repr__(self):
        r = []
        for d in self.data:
            if d[0] == d[1]:
                r.append('%-9s < %s %s >' % (self.tag, repr_code(d[0]),
                                             repr_code(d[2])))
            else:
                r.append('%-9s < %s - %s : %s >' % (
                    self.tag, repr_code(d[0]), repr_code(d[1]),
                    repr_code(d[2])))
        return '\n'.join(r)

    def takemaps(self, st_code, en_code):
        """Return a new MapStack with the runs lying inside st_code..en_code.

        Runs that only partly overlap the interval are left out.
        """
        m = MapStack(self.tag)
        for ist, ien, idst, iden in self.data:
            if st_code <= ist and ien <= en_code:
                m.data.append([ist, ien, idst, iden])
            # elif: .. no splitting needed on UCS2
        return m

    def add(self, idx, val):
        """Record that code *idx* maps to *val*.

        The mapping extends the last run when both *idx* and *val* are
        exactly one greater than that run's end; otherwise it starts a new
        run.
        """
        if (self.data and idx - self.data[-1][1] == 1 and
                val - self.data[-1][3] == 1):
            self.data[-1][1] = idx
            self.data[-1][3] = val
        else:
            self.data.append([idx, idx, val, val])

    def __bool__(self):
        # True when at least one run is stored.
        return bool(self.data)

    __nonzero__ = __bool__  # Python 2 spelling of __bool__


class UnicodeData:
    """Per-code-point records loaded from a UnicodeData.txt style file.

    ``data[code]`` is None for codes without a record, otherwise the list of
    ';'-separated fields that follow the code, with the computed flags
    appended as the last element by ``tag_bsdctype``.  After construction
    ``uppermap``, ``lowermap`` and ``digitmap`` hold the MapStack tables.
    """

    def __init__(self, filepath):
        self.data = []
        with open(filepath) as fo:
            self.load(fo)
        self.tag_bsdctype()

    def load(self, fo):
        """Fill ``data`` from the open text file (or line iterable) *fo*.

        Text after a '#' is ignored, codes above U+FFFF are skipped, and a
        '... First>' / '... Last>' pair is expanded into one record per code
        whose name has 'First' replaced by 'Element'.
        """
        self.data = [None] * 65536
        pdata = []
        for line in fo:
            line = line.split('#', 1)[0]
            if not line.strip():
                continue
            code, value = line.split(';', 1)
            codepoint = hex2int(code)
            if codepoint > 0xffff:  # FreeBSD doesn't have UCS-4 yet.
                continue
            pdata.append([codepoint] + value.strip().split(';'))
            if pdata[-1][1].endswith('Last>'):
                en = pdata.pop()
                st = pdata.pop()
                extname = st[1].replace('First', 'Element')
                for c in range(st[0], en[0] + 1):
                    pdata.append([c, extname] + st[2:])
        for p in pdata:
            self.data[p[0]] = p[1:]

    def tag_bsdctype(self):
        """Append the flags to every record and build the mapping tables.

        Field positions used: 0 name, 1 category, 3 bidi class, 5 value
        added to the TODIGIT map, 11 and 12 values added to the MAPUPPER
        and MAPLOWER maps.  Codes without a record are skipped, including
        those listed in EXCEPTIONS.
        """
        self.uppermap = MapStack('MAPUPPER')
        self.lowermap = MapStack('MAPLOWER')
        self.digitmap = MapStack('TODIGIT')
        for i in range(65536):
            rec = self[i]
            if not rec:
                continue
            if i in EXCEPTIONS:
                rec.append(EXCEPTIONS[i])
            else:
                name, category, bidi = rec[0], rec[1], rec[3]
                flags = CLASSES[category] | BIDIRECT_PROPS[bidi]
                if i < 128 and (flags & DIGIT or XDIGITMASK.search(name)):
                    flags |= XDIGIT
                elif category == 'Lo':
                    if IDEOGRAMMASK.search(name):
                        flags |= IDEOGRAM
                    elif PHONOGRAMMASK.search(name):
                        flags |= PHONOGRAM
                rec.append(flags)
            if rec[11]:
                self.uppermap.add(i, hex2int(rec[11]))
            if rec[12]:
                self.lowermap.add(i, hex2int(rec[12]))
            if rec[5]:
                self.digitmap.add(i, hex2int(rec[5]))

    def __getitem__(self, key):
        return self.data.__getitem__(key)

    def __setitem__(self, key, value):
        self.data.__setitem__(key, value)


class BlockData(list):
    """List of ``(CodeArea, name)`` pairs loaded from a Blocks.txt file."""

    def __init__(self, filepath):
        super(BlockData, self).__init__()
        with open(filepath) as fo:
            self.load(fo)

    def load(self, fo):
        """Replace the contents with the blocks read from *fo*.

        Each non-empty line (text after '#' removed) has the form
        ``START..END; name``; blocks starting above U+FFFF are skipped.
        """
        del self[:]
        for line in fo:
            line = line.split('#', 1)[0]
            if not line.strip():
                continue
            code, value = line.split(';', 1)
            code = [hex2int(m) for m in code.strip().split('..')]
            if code[0] > 0xffff:  # FreeBSD doesn't have UCS-4 yet.
                continue
            self.append((CodeArea(*code), value.strip()))


def generate_ctype(blocks, codes):
    """Write the LC_CTYPE definitions for *blocks* to standard output.

    *blocks* is an iterable of ``(CodeArea, name)`` pairs and *codes* a
    tagged UnicodeData.  For every block this writes a comment header, one
    or more lines per character class listing the member codes, and then
    the upper/lower/digit mapping runs that fall inside the block.
    """
    _emit(FILEHEADER)
    for area, name in blocks:
        _emit(BLOCKHEADER % {'area': area, 'name': name})
        # A single-code area has no end; treat it as a one-element range.
        last = area.st if area.en is None else area.en
        blockcodes = range(area.st, last + 1)
        for mask, fname in BSD_CTYPES:
            # Collect the runs of consecutive codes carrying this flag.
            runs = []
            for c in blockcodes:
                rec = codes[c]
                if rec is not None and mask & rec[-1]:
                    if runs and c - runs[-1][1] == 1:
                        runs[-1][1] = c
                    else:
                        runs.append([c, c])
            # Pack the runs onto lines, starting a new line once the
            # current one is longer than 60 characters.
            ob = ''
            for st, en in runs:
                if ob:
                    ob += ' '
                ob += repr_codearea(st, en)
                if len(ob) > 60:
                    _emit('%s %s' % (fname, ob))
                    ob = ''
            if ob:
                _emit('%s %s' % (fname, ob))
        mapprinted = False
        for charmap in (codes.uppermap, codes.lowermap, codes.digitmap):
            mapsinarea = charmap.takemaps(area.st, last)
            if mapsinarea:
                if not mapprinted:
                    # One blank line separates the classes from the maps.
                    _emit()
                    mapprinted = True
                _emit(mapsinarea)
        if area.st == 0:
            _emit("TODIGIT < 'A' - 'F' : 10 > < 'a' - 'f' : 10 >")


if __name__ == '__main__':
    BLOCKSFILE = BlockData("Blocks.txt")
    UNICODEFILE = UnicodeData("UnicodeData.txt")
    generate_ctype(BLOCKSFILE, UNICODEFILE)