らんだむな記憶

blogというものを体験してみようか!的なー

Shift_JISの亜種間での比較

SHIFTJIS.TXT、CP932.TXT、JAPANESE.TXT の3つのマッピングファイルの差を比較したい。特に後者の2つ(CP932.TXT と JAPANESE.TXT)が SHIFTJIS.TXT から何が変化しているか(機種依存文字)を知りたい。

#! /usr/bin/env python
# -*- coding: utf-8 -*-

import os, sys, re
from collections import namedtuple

# One mapping-table entry: the Shift_JIS code, the list of Unicode code points
# it maps to (more than one for "a+b" sequences), and the text after the "#".
Sjis = namedtuple('Sjis', ('sjis_code', 'unis', 'name'))

def parse_sjis_definition(file):
    """Parse a Shift_JIS mapping table into a dict keyed by Shift_JIS code.

    A line is accepted when it contains a "#" and the text before the first
    "#" consists of exactly two whitespace-separated fields: the Shift_JIS
    code in hex and either one Unicode code point in hex or several joined
    with "+". Everything after the first "#" (stripped) becomes the name.

    Blank lines, lines without a "#" and lines whose code part does not have
    exactly two fields (comment-only lines, undefined entries) are skipped.
    If the same Shift_JIS code occurs more than once, the last entry wins.

    Args:
        file: Path of the mapping file to read.

    Returns:
        dict mapping the integer Shift_JIS code to its ``Sjis`` record.

    Raises:
        ValueError: If one of the two code fields is not valid hexadecimal.
    """
    sjis2info = {}
    with open(file) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            # partition() reports a missing "#" through an empty separator
            # instead of raising, so such lines can simply be skipped.
            codes, sep, name = line.partition("#")
            if not sep:
                continue
            codes = codes.split()
            if len(codes) != 2:
                continue
            sjis_code = int(codes[0], 16)
            # A single code point is just the one-element case of "a+b+...".
            unis = [int(v, 16) for v in codes[1].split("+")]
            sjis2info[sjis_code] = Sjis(sjis_code=sjis_code, unis=unis, name=name.strip())
    return sjis2info

def compact_codes(codes):
    """Format integer codes as a comma-separated list of hex values and ranges.

    A run of values that each equal the previous value plus one is collapsed
    into ``0xS-0xE``; a value on its own is written as ``0xS``. Values are
    processed in the order given, so callers should pass them sorted.

    Args:
        codes: Iterable of integer codes.

    Returns:
        The compacted string, e.g. ``"0x1-0x3,0x5"``. An empty iterable
        yields an empty string.
    """
    def fmt(start, end):
        # A one-element run is printed as a single value, not as "S-S".
        if start == end:
            return "0x{:X}".format(start)
        return "0x{:X}-0x{:X}".format(start, end)

    ccodes = []
    # None marks "no run open yet"; unlike a -1 sentinel it cannot leak into
    # the output when the input is empty.
    s = e = None
    for code in codes:
        if e is None:
            s = e = code
        elif code == e + 1:
            e = code
        else:
            ccodes.append(fmt(s, e))
            s = e = code

    if e is not None:
        ccodes.append(fmt(s, e))

    return ",".join(ccodes)

def diff(sjis2info1, sjis2info2):
    """Print the Shift_JIS codes present in only one of two mapping tables.

    For each table in turn (first, then second), if it holds codes the other
    lacks, a bracketed header with their count and compacted code list is
    printed, followed by one tab-separated line per code in ascending order.
    Nothing is printed for a direction that has no extra codes.

    Args:
        sjis2info1: Mapping of Shift_JIS code to entry for the first file.
        sjis2info2: Mapping of Shift_JIS code to entry for the second file.
    """
    def describe(entry):
        # Codes up to 0x7F and multi-code-point mappings are listed without
        # the rendered character; all others append the character itself.
        if entry.sjis_code <= 0x7f or len(entry.unis) > 1:
            joined = "+".join("0x{:04X}".format(u) for u in entry.unis)
            return "0x{:X}	{}	# {}".format(entry.sjis_code, joined, entry.name)
        uni = entry.unis[0]
        return "0x{:X}	0x{:04X}	# {} ({})".format(entry.sjis_code, uni, entry.name, chr(uni))

    directions = (
        ("[only first file has extra {} codes... ({})]", sjis2info1, sjis2info2),
        ("[only second file has extra {} codes... ({})]", sjis2info2, sjis2info1),
    )
    for header, mine, theirs in directions:
        # Key views support set difference directly.
        extra = sorted(mine.keys() - theirs.keys())
        if not extra:
            continue
        print(header.format(len(extra), compact_codes(extra)))
        for code in extra:
            print(describe(mine[code]))

def main():
    """Compare the two mapping files named on the command line.

    Reads the first two command-line arguments as mapping-file paths, parses
    both and prints their differences. Any further arguments are ignored.

    Raises:
        SystemExit: With status 2, after printing a usage line to stderr,
            when fewer than two file paths are given.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message rather than an IndexError traceback.
        print("usage: {} FILE1 FILE2".format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)
    sjis2info1 = parse_sjis_definition(sys.argv[1])
    sjis2info2 = parse_sjis_definition(sys.argv[2])
    diff(sjis2info1, sjis2info2)

if __name__ == "__main__":
    main()

くらいでまぁ比較できる。

結果

SHIFTJIS.TXT vs CP932.TXT

[only second file has extra 878 codes... (0x0-0x1F,0x7F,0x8740-0x875D,0x875F-0x8775,0x877E,0x8780-0x879C,0xED40-0xED7E,0xED80-0xEDFC,0xEE40-0xEE7E,0xEE80-0xEEEC,0xEEEF-0xEEFC,0xFA40-0xFA7E,0xFA80-0xFAFC,0xFB40-0xFB7E,0xFB80-0xFBFC,0xFC40-0xFC4B)]

SHIFTJIS.TXT vs JAPANESE.TXT

[only second file has extra 318 codes... (0x80,0xA0,0xFD-0xFF,0x8540-0x8553,0x855E-0x8571,0x857C-0x857E,0x8580-0x8585,0x8591-0x859A,0x859F-0x85AD,0x85B3-0x85C1,0x85DB-0x85F4,0x8640-0x865D,0x869B-0x86A6,0x86B3-0x86B5,0x86C7-0x86D6,0x8740-0x8758,0x8791-0x87B5,0x87BD-0x87C1,0x87E5-0x87E8,0x87FA-0x87FC,0x8840-0x8842,0x8854-0x8855,0x8868,0x886A-0x886D,0xEB41-0xEB42,0xEB50-0xEB51,0xEB5B-0xEB5D,0xEB60-0xEB64,0xEB69-0xEB7A,0xEB81,0xEC9F,0xECA1,0xECA3,0xECA5,0xECA7,0xECC1,0xECE1,0xECE3,0xECE5,0xECEC,0xED40,0xED42,0xED44,0xED46,0xED48,0xED62,0xED83,0xED85,0xED87,0xED8E,0xED95-0xED96)]

ほぉ。

0xFD	0x00A9	# COPYRIGHT SIGN # Apple addition (©)

のコピーライトサインってMacJapaneseだけなのか・・・。