#!/usr/bin/env python
#
# FreeBSD UTF-8 LC_CTYPE Generator
#
# by Hye-Shik Chang <perky@fallin.lv>
#

import re, sys

# Character-class flag bits.  Every class owns exactly one bit, so the
# classes a code point belongs to can be OR-ed together into one mask
# and tested with a bitwise AND.
ALPHA       = 1 << 0
CONTROL     = 1 << 1
DIGIT       = 1 << 2
GRAPH       = 1 << 3
LOWER       = 1 << 4
PUNCT       = 1 << 5
SPACE       = 1 << 6
UPPER       = 1 << 7
XDIGIT      = 1 << 8
BLANK       = 1 << 9
PRINT       = 1 << 10
IDEOGRAM    = 1 << 11
SPECIAL     = 1 << 12
PHONOGRAM   = 1 << 13

# Output order and keyword of every character class.  generate_ctype()
# walks this tuple once per block and prints the keyword in front of the
# code points whose mask has the paired flag bit set, so the order here
# is the order of the lines in the generated output.  The keywords are
# space-padded to a common width of nine characters.
BSD_CTYPES = (
    (ALPHA,     'ALPHA    '),
    (CONTROL,   'CONTROL  '),
    (DIGIT,     'DIGIT    '),
    (GRAPH,     'GRAPH    '),
    (LOWER,     'LOWER    '),
    (PUNCT,     'PUNCT    '),
    (SPACE,     'SPACE    '),
    (UPPER,     'UPPER    '),
    (XDIGIT,    'XDIGIT   '),
    (BLANK,     'BLANK    '),
    (PRINT,     'PRINT    '),
    (IDEOGRAM,  'IDEOGRAM '),
    (SPECIAL,   'SPECIAL  '),
    (PHONOGRAM, 'PHONOGRAM'),
)

# Flag bits contributed by a record's general category, i.e. index 1 of
# the per-code-point field list stored by UnicodeData.load().
# UnicodeData.tag_bsdctype() ORs the value found here with the matching
# BIDIRECT_PROPS entry; the lookup is a plain subscript, so a category
# missing from this table raises KeyError there.
CLASSES = {
    '':   0,
    'C':  CONTROL,                          # Other
    'Cc': CONTROL,                          # Control
    'Cf': CONTROL,                          # Format
    'Cn': 0,                                # Unassigned
    'Co': PRINT | GRAPH,                    # Private_Use
    'Cs': PRINT,                            # Surrogate
    'L':  PRINT | GRAPH,                    # Letter
    'LC': PRINT | GRAPH | ALPHA,            # Cased_Letter
    'Ll': PRINT | GRAPH | ALPHA | LOWER,    # Lowercase_Letter
    'Lm': PRINT | GRAPH,                    # Modifier_Letter
    'Lo': PRINT | GRAPH,                    # Other_Letter
    'Lt': PRINT | GRAPH | ALPHA,            # Titlecase_Letter
    'Lu': PRINT | GRAPH | ALPHA | UPPER,    # Uppercase_Letter
    'M':  PRINT | GRAPH,                    # Mark
    'Mc': PRINT | GRAPH,                    # Spacing_Mark
    'Me': PRINT | GRAPH,                    # Enclosing_Mark
    'Mn': PRINT | GRAPH,                    # Nonspacing_Mark
    'N':  PRINT | GRAPH | DIGIT,            # Number
    'Nd': PRINT | GRAPH | DIGIT,            # Decimal_Number
    'Nl': PRINT | GRAPH | SPECIAL,          # Letter_Number
    'No': PRINT | GRAPH | SPECIAL,          # Other_Number
    'P':  PRINT | GRAPH | PUNCT,            # Punctuation
    'Pc': PRINT | GRAPH | PUNCT,            # Connector_Punctuation
    'Pd': PRINT | GRAPH | PUNCT,            # Dash_Punctuation
    'Pe': PRINT | GRAPH | PUNCT,            # Close_Punctuation
    'Pf': PRINT | GRAPH | PUNCT,            # Final_Punctuation
    'Pi': PRINT | GRAPH | PUNCT,            # Initial_Punctuation
    'Po': PRINT | GRAPH | PUNCT,            # Other_Punctuation
    'Ps': PRINT | GRAPH | PUNCT,            # Open_Punctuation
    'S':  PRINT | GRAPH | PUNCT,            # Symbol
    'Sc': PRINT | GRAPH | PUNCT,            # Currency_Symbol
    'Sk': PRINT | GRAPH | PUNCT,            # Modifier_Symbol
    'Sm': PRINT | GRAPH | PUNCT,            # Math_Symbol
    'So': PRINT | GRAPH | PUNCT,            # Other_Symbol
    'Z':  PRINT | SPACE,                    # Separator
    'Zl': PRINT | SPACE,                    # Line_Separator
    'Zp': PRINT | SPACE,                    # Paragraph_Separator
    'Zs': PRINT | SPACE | BLANK,            # Space_Separator
}

# Extra flag bits contributed by a record's bidirectional class, i.e.
# index 3 of the per-code-point field list stored by UnicodeData.load().
# Only the separator/whitespace classes 'B', 'S' and 'WS' add anything;
# every other listed class maps to 0.  The lookup in
# UnicodeData.tag_bsdctype() is a plain subscript, so a bidirectional
# class missing from this table raises KeyError there.
BIDIRECT_PROPS = {
    '':    0,
    'AL':  0,                               # Arabic_Letter
    'AN':  0,                               # Arabic_Number
    'B':   SPACE,                           # Paragraph_Separator
    'BN':  0,                               # Boundary_Neutral
    'CS':  0,                               # Common_Separator
    'EN':  0,                               # European_Number
    'ES':  0,                               # European_Separator
    'ET':  0,                               # European_Terminator
    'L':   0,                               # Left_To_Right
    'LRE': 0,                               # Left_To_Right_Embedding
    'LRO': 0,                               # Left_To_Right_Override
    'NSM': 0,                               # Nonspacing_Mark
    'ON':  0,                               # Other_Neutral
    'PDF': 0,                               # Pop_Directional_Format
    'R':   0,                               # Right_To_Left
    'RLE': 0,                               # Right_To_Left_Embedding
    'RLO': 0,                               # Right_To_Left_Override
    'S':   BLANK | SPACE,                   # Segment_Separator
    'WS':  SPACE,                           # White_Space
}

# Code points whose mask is fixed by hand.  UnicodeData.tag_bsdctype()
# uses the value stored here as-is and skips the CLASSES/BIDIRECT_PROPS
# lookup for these code points.  Currently U+001C..U+001F, all CONTROL.
EXCEPTIONS = dict.fromkeys(range(0x001c, 0x0020), CONTROL)

# Classifiers that UnicodeData.tag_bsdctype() runs against a character's
# name (field 0 of its record).

# Name ends in "LETTER A" .. "LETTER F"; only consulted below U+0080.
XDIGITMASK = re.compile('LETTER [A-F]$')

# Name contains "ideogr" in any letter case; only consulted for 'Lo'.
IDEOGRAMMASK = re.compile('[Ii][Dd][Ee][Oo][Gg][Rr]')

# Name contains "syllab", "hangul lett", "hiragana" or "katakana" in any
# letter case; only consulted for 'Lo' names that are not ideographs.
PHONOGRAMMASK = re.compile(
    '[Ss][Yy][Ll][Ll][Aa][Bb]'
    '|[Hh][Aa][Nn][Gg][Uu][Ll][ ][Ll][Ee][Tt][Tt]'
    '|[Hh][Ii][Rr][Aa][Gg][Aa][Nn][Aa]'
    '|[Kk][Aa][Tt][Aa][Kk][Aa][Nn][Aa]'
)

# Preamble of the generated output: generate_ctype() prints this text
# verbatim, once, before any block.  It consists of a C-style comment
# carrying the license, followed by the ENCODING and VARIABLE lines.
# NOTE(review): the output is presumably meant as input for FreeBSD's
# mklocale(1) -- confirm before changing any of the keywords below.
FILEHEADER = """\
/*
 * UTF-8 LC_CTYPE definitions
 *
 * Copyright (c) 2002 Hye-Shik Chang. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

ENCODING	"UTF2"
VARIABLE	UTF-8 Character Types"""

# Per-block banner printed by generate_ctype() ahead of each block's
# definitions.  It is %-formatted with a mapping that must provide the
# keys 'area' (rendered with %s) and 'name'.
BLOCKHEADER = """

/*
 * %(area)s : %(name)s
 */
"""

def hex2int(x):
    """Return the integer value of the hexadecimal digit string *x*.

    *x* carries no ``0x`` prefix, e.g. ``'0041'``.  The text comes from
    the input data files, so it is parsed with ``int(x, 16)`` rather
    than evaluated as Python source; anything that is not a hexadecimal
    number raises ValueError.
    """
    return int(x, 16)
def repr_code(c):
    """Return the textual form of code point *c* used in the output.

    ASCII letters and digits are written as a quoted character
    (``'A'``); every other code point is written as lower-case
    hexadecimal padded to at least four digits (``0x0020``).
    """
    if c < 0x7f and chr(c).isalnum():
        return "'%s'" % chr(c)
    return '0x%04x' % c

def repr_codearea(st, en):
    """Return the textual form of the code point run *st* .. *en*.

    A run of one code point is written as that code point alone, two
    code points one apart are written side by side separated by two
    spaces, and anything longer is written as ``first - last``.
    """
    low = repr_code(st)
    if st == en:
        return low

    high = repr_code(en)
    if abs(st - en) > 1:
        return low + " - " + high
    return low + "  " + high

class CodeArea:
    """A single code point or an inclusive range of code points.

    ``st`` is the first code point.  ``en`` is the last code point, or
    ``None`` when the area consists of ``st`` alone.

    Areas are ordered by their first code point and then by their last
    one.  Equality and hashing use the same ``(first, last)`` pair, so
    ``CodeArea(5)`` and ``CodeArea(5, 5)`` are equal, while two areas
    that merely share a start are not.
    """

    def __init__(self, st, en=None):
        self.st = st
        self.en = en

    def _key(self):
        """Return ``(first, last)``, with a missing end replaced by ``st``."""
        if self.en is None:
            return (self.st, self.st)
        return (self.st, self.en)

    def isincode(self, c):
        """Return True when code point *c* lies inside this area."""
        first, last = self._key()
        return first <= c <= last

    def __hash__(self):
        return hash(self._key())

    # Rich comparisons: __cmp__ alone is ignored by Python 3, and it
    # compared only ``st``, which disagreed with __hash__.
    def __eq__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self._key() == o._key()

    def __ne__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self._key() != o._key()

    def __lt__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self._key() < o._key()

    def __le__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self._key() <= o._key()

    def __gt__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self._key() > o._key()

    def __ge__(self, o):
        if not isinstance(o, CodeArea):
            return NotImplemented
        return self._key() >= o._key()

    def __cmp__(self, o):
        # Kept for callers that invoke it directly; written without the
        # cmp() builtin, which Python 3 no longer provides.
        mine, theirs = self._key(), o._key()
        return (mine > theirs) - (mine < theirs)

    def __repr__(self):
        if self.en is None:
            return 'U+%04X' % self.st
        else:
            return 'U+%04X - U+%04X' % (self.st, self.en)


class MapStack:
    """Run-length compressed code point mapping for one output keyword.

    ``tag`` is the keyword printed in front of every run.  ``data`` is a
    list of ``[src_first, src_last, dst_first, dst_last]`` runs; add()
    only grows a run while source and destination both advance by one,
    so inside a run the destination is the source plus a fixed offset.
    """

    def __init__(self, tag):
        self.tag = tag
        self.data = []

    def __repr__(self):
        """Return one output line per run, joined with newlines."""
        lines = []
        for src_first, src_last, dst_first, _dst_last in self.data:
            if src_first == src_last:
                lines.append('%-9s < %s %s >' % (
                    self.tag, repr_code(src_first), repr_code(dst_first)))
            else:
                lines.append('%-9s < %s - %s : %s >' % (
                    self.tag, repr_code(src_first), repr_code(src_last),
                    repr_code(dst_first)))
        return '\n'.join(lines)

    def takemaps(self, st_code, en_code):
        """Return a new MapStack limited to sources in st_code..en_code.

        Runs lying entirely inside the range are copied unchanged.  A
        run that straddles either boundary is clipped to the range, with
        its destination shifted by the same amount, instead of being
        dropped.
        """
        m = MapStack(self.tag)
        for ist, ien, idst, iden in self.data:
            lo = max(ist, st_code)
            hi = min(ien, en_code)
            if lo <= hi:
                m.data.append([lo, hi, idst + (lo - ist), iden - (ien - hi)])
        return m

    def add(self, idx, val):
        """Record that source *idx* maps to *val*.

        The pair extends the most recent run when both numbers directly
        follow that run's last pair; otherwise it starts a new run.
        """
        if (self.data and idx - self.data[-1][1] == 1 and val - self.data[-1][3] == 1):
            self.data[-1][1] = idx
            self.data[-1][3] = val
        else:
            self.data.append([idx, idx, val, val])

    def __nonzero__(self):
        """Return True when at least one run has been recorded."""
        return bool(self.data)

    # Python 3 consults __bool__; without it an empty stack is truthy.
    __bool__ = __nonzero__


class UnicodeData:
    """The U+0000..U+FFFF part of a UnicodeData.txt file, with ctype masks.

    ``data`` is a 65536-slot list indexed by code point.  A slot is
    ``None`` for a code point the file does not list; otherwise it is
    the list of ';'-separated fields that follow the code point, and
    tag_bsdctype() appends the computed flag mask as the last element.

    ``uppermap``, ``lowermap`` and ``digitmap`` are the MapStack objects
    filled in by tag_bsdctype().
    """

    def __init__(self, filepath):
        """Load *filepath* and compute the masks and mappings."""
        self.data = []
        # Close the file as soon as it has been parsed.
        with open(filepath) as fo:
            self.load(fo)
        self.tag_bsdctype()

    def load(self, fo):
        """Fill ``data`` from the lines of the open text file *fo*.

        Text after a '#' and blank lines are ignored, as are code points
        above U+FFFF.  A pair of records whose names end in ``First>``
        and ``Last>`` is expanded into one record per code point of the
        range, with 'First' replaced by 'Element' in the name.
        """
        self.data = [None] * 65536

        pdata = []
        for line in fo:
            line = line.split('#', 1)[0]
            if not line.strip():
                continue
            code, value = line.split(';', 1)
            code = int(code, 16)
            if code > 0xffff:
                # FreeBSD doesn't have UCS-4 yet.
                continue
            pdata.append([code] + value.strip().split(';'))
            if pdata[-1][1].endswith('Last>'):
                # The record just before a 'Last>' one is its 'First>'.
                en = pdata.pop()
                st = pdata.pop()
                extname = st[1].replace('First', 'Element')
                for c in range(st[0], en[0] + 1):
                    pdata.append([c, extname] + st[2:])

        for p in pdata:
            self.data[p[0]] = p[1:]

    def tag_bsdctype(self):
        """Append a flag mask to every record and build the mappings.

        The mask is the EXCEPTIONS value when the code point is listed
        there.  Otherwise it is the CLASSES bits for the general
        category (field 1) ORed with the BIDIRECT_PROPS bits for the
        bidirectional class (field 3), refined by the name (field 0):
        XDIGIT for code points below 128 that are digits or match
        XDIGITMASK, else IDEOGRAM or PHONOGRAM for 'Lo' characters.
        Code points with no record are skipped.

        Non-empty fields 11, 12 and 5 feed ``uppermap``, ``lowermap``
        and ``digitmap`` respectively.

        Raises KeyError for a general category or bidirectional class
        that CLASSES or BIDIRECT_PROPS does not know.
        """
        self.uppermap = MapStack('MAPUPPER')
        self.lowermap = MapStack('MAPLOWER')
        self.digitmap = MapStack('TODIGIT')

        for i in range(65536):
            entry = self[i]
            if not entry:
                continue

            if i in EXCEPTIONS:
                entry.append(EXCEPTIONS[i])
            else:
                ctype = CLASSES[entry[1]] | BIDIRECT_PROPS[entry[3]]
                if i < 128 and (ctype & DIGIT or XDIGITMASK.search(entry[0])):
                    ctype |= XDIGIT
                elif entry[1] == 'Lo':
                    if IDEOGRAMMASK.search(entry[0]):
                        ctype |= IDEOGRAM
                    elif PHONOGRAMMASK.search(entry[0]):
                        ctype |= PHONOGRAM
                entry.append(ctype)

            # The mask was appended at the end, so the indexes of the
            # original fields are unchanged.
            if entry[11]:
                self.uppermap.add(i, hex2int(entry[11]))
            if entry[12]:
                self.lowermap.add(i, hex2int(entry[12]))
            if entry[5]:
                self.digitmap.add(i, hex2int(entry[5]))

    def __getitem__(self, key):
        return self.data.__getitem__(key)

    def __setitem__(self, key, value):
        self.data.__setitem__(key, value)


class BlockData(list):
    """List of ``(CodeArea, block_name)`` pairs read from a Blocks.txt file."""

    def __init__(self, filepath):
        """Load the block definitions stored in *filepath*."""
        list.__init__(self)
        # Close the file as soon as it has been parsed.
        with open(filepath) as fo:
            self.load(fo)

    def load(self, fo):
        """Replace the list contents with the blocks found in *fo*.

        Each useful line looks like ``first..last; name`` with the
        bounds in hexadecimal.  Text after a '#' and blank lines are
        ignored, and so is any block starting above U+FFFF.
        """
        del self[:]

        for line in fo:
            line = line.split('#', 1)[0]
            if not line.strip():
                continue
            code, value = line.split(';', 1)
            bounds = [int(m, 16) for m in code.strip().split('..')]
            if bounds[0] > 0xffff:
                # FreeBSD doesn't have UCS-4 yet.
                continue
            self.append((CodeArea(*bounds), value.strip()))


def generate_ctype(blocks, codes):
    """Print the LC_CTYPE source for *blocks* to standard output.

    *blocks* is an iterable of ``(area, name)`` pairs, where ``area``
    has ``st`` and ``en`` attributes.  *codes* is indexable by code
    point, yielding ``None`` or a record whose last element is the flag
    mask, and has ``uppermap``, ``lowermap`` and ``digitmap``
    attributes providing ``takemaps()``.

    FILEHEADER is printed first.  Then, for every block: the
    BLOCKHEADER banner, one or more lines per character class listing
    the code points of that class, and the case/digit mappings that
    fall inside the block.
    """
    # Every print() call below is given exactly one argument, so the
    # output is the same whether print is a statement or a function.
    print(FILEHEADER)

    for area, name in blocks:

        print(BLOCKHEADER % {'area': area, 'name': name})

        # An area built from a single code point has no explicit end.
        last = area.en
        if last is None:
            last = area.st
        blockcodes = range(area.st, last + 1)

        for mask, fname in BSD_CTYPES:
            # Collapse the members of this class into [first, last] runs.
            runs = []
            for c in blockcodes:
                entry = codes[c]
                if entry is not None and mask & entry[-1]:
                    if runs and c - runs[-1][1] == 1:
                        runs[-1][1] = c
                    else:
                        runs.append([c, c])

            # Pack the runs into lines, starting a new line once the
            # pending text has grown past 60 characters.
            ob = ''
            for st, en in runs:
                if ob:
                    ob += '  '
                ob += repr_codearea(st, en)

                if len(ob) > 60:
                    print('%s %s' % (fname, ob))
                    ob = ''
            if ob:
                print('%s %s' % (fname, ob))

        mapprinted = False
        for charmap in (codes.uppermap, codes.lowermap, codes.digitmap):
            mapsinarea = charmap.takemaps(area.st, last)
            if mapsinarea.data:
                if not mapprinted:
                    # One blank line separates the classes from the maps.
                    print('')
                    mapprinted = True
                print(mapsinarea)

        if area.st == 0:
            # Hexadecimal letter digits are emitted by hand for the
            # block that starts at U+0000.
            print("TODIGIT    < 'A' - 'F' : 10 > < 'a' - 'f' : 10 >")

if __name__ == '__main__':
    # Usage: <script> [Blocks.txt [UnicodeData.txt]]
    # Both arguments are optional; without them the input files are
    # looked up under their standard names in the current directory.
    blocks_path = "Blocks.txt"
    unicodedata_path = "UnicodeData.txt"
    if len(sys.argv) > 1:
        blocks_path = sys.argv[1]
    if len(sys.argv) > 2:
        unicodedata_path = sys.argv[2]

    BLOCKSFILE = BlockData(blocks_path)
    UNICODEFILE = UnicodeData(unicodedata_path)
    generate_ctype(BLOCKSFILE, UNICODEFILE)