- SHIFTJIS.TXT
- CP932.TXT
- JAPANESE.TXT(たぶんMacJapanese)
の差を比較したい。特に、後者の2つがSHIFTJIS.TXTから何が変化しているか(機種依存文字)を確認したい。
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Compare two Shift_JIS-style mapping tables and list the codes only one defines.

Usage: pass the two mapping files (e.g. SHIFTJIS.TXT and CP932.TXT) as the
first and second command-line arguments.
"""
import os
import sys
import re
from collections import namedtuple

# One mapping entry:
#   sjis_code -- the Shift_JIS code as an int
#   unis      -- list of Unicode code points (several when joined with "+")
#   name      -- the text following the first "#" on the line
Sjis = namedtuple('Sjis', ('sjis_code', 'unis', 'name'))


def parse_sjis_definition(file):
    """Parse a mapping table file into a dict keyed by Shift_JIS code.

    Each useful line has the shape ``<sjis hex> <unicode hex> # <name>``,
    where the Unicode column may hold several hex values joined by ``+``.
    Blank lines, lines without a ``#``, and lines that do not have exactly
    two whitespace-separated columns before the ``#`` are skipped.

    Args:
        file: Path of the mapping table to read.

    Returns:
        dict mapping the integer Shift_JIS code to its ``Sjis`` entry.

    Raises:
        ValueError: If a two-column line contains a value that is not hex.
    """
    sjis2info = {}
    # Only the hex columns are interpreted; undecodable bytes (e.g. in the
    # free-text comments) are replaced so they cannot abort the parse.
    with open(file, encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            # find() returns -1 when "#" is absent; index() would raise and
            # make the skip below unreachable.
            number_sign = line.find("#")
            if number_sign < 0:
                continue
            columns = line[:number_sign].split()
            name = line[number_sign + 1:].strip()
            if len(columns) != 2:
                continue
            sjis_code = int(columns[0], 16)
            try:
                unis = [int(columns[1], 16)]
            except ValueError:
                # Not a single hex number: treat it as a "+"-joined sequence.
                unis = [int(v, 16) for v in columns[1].split("+")]
            sjis2info[sjis_code] = Sjis(sjis_code=sjis_code, unis=unis, name=name)
    return sjis2info


def compact_codes(codes):
    """Render codes as a comma-separated list, collapsing consecutive runs.

    Args:
        codes: Iterable of integer codes, expected in ascending order.

    Returns:
        A string such as ``"0x0-0x1F,0x7F"``; an empty string when ``codes``
        is empty.
    """
    runs = []  # each item is a [start, end] pair of an inclusive run
    for code in codes:
        if runs and code == runs[-1][1] + 1:
            runs[-1][1] = code
        else:
            runs.append([code, code])
    return ",".join(
        "0x{:X}".format(start) if start == end
        else "0x{:X}-0x{:X}".format(start, end)
        for start, end in runs
    )


def diff(sjis2info1, sjis2info2):
    """Print the Shift_JIS codes that exist in only one of the two tables.

    For each side that has extra codes, a summary line with the count and
    the compacted code ranges is printed, followed by one line per entry.
    Codes present in both tables are not compared any further.

    Args:
        sjis2info1: Table parsed from the first file.
        sjis2info2: Table parsed from the second file.
    """
    def show_diff(diff_sjis_codes, sjis2info):
        for sjis_code in sorted(diff_sjis_codes):
            info = sjis2info[sjis_code]
            if info.sjis_code > 0x7f and len(info.unis) <= 1:
                # Single code point above the 7-bit range: also print the
                # character itself.
                print("0x{:X} 0x{:04X} # {} ({})".format(
                    info.sjis_code, info.unis[0], info.name, chr(info.unis[0])))
            else:
                unis = "+".join("0x{:04X}".format(v) for v in info.unis)
                print("0x{:X} {} # {}".format(info.sjis_code, unis, info.name))

    sjis_codes1 = set(sjis2info1)
    sjis_codes2 = set(sjis2info2)

    only_first = sjis_codes1 - sjis_codes2
    if only_first:
        print("[only first file has extra {} codes... ({})]".format(
            len(only_first), compact_codes(sorted(only_first))))
        show_diff(only_first, sjis2info1)

    only_second = sjis_codes2 - sjis_codes1
    if only_second:
        print("[only second file has extra {} codes... ({})]".format(
            len(only_second), compact_codes(sorted(only_second))))
        show_diff(only_second, sjis2info2)


def main():
    """Parse the two files named on the command line and print their diff."""
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: {} FIRST.TXT SECOND.TXT".format(sys.argv[0]))
    sjis2info1 = parse_sjis_definition(sys.argv[1])
    sjis2info2 = parse_sjis_definition(sys.argv[2])
    diff(sjis2info1, sjis2info2)


if __name__ == "__main__":
    main()
くらいでまぁ比較できる。
結果
SHIFTJIS.TXT vs CP932.TXT
[only second file has extra 878 codes... (0x0-0x1F,0x7F,0x8740-0x875D,0x875F-0x8775,0x877E,0x8780-0x879C,0xED40-0xED7E,0xED80-0xEDFC,0xEE40-0xEE7E,0xEE80-0xEEEC,0xEEEF-0xEEFC,0xFA40-0xFA7E,0xFA80-0xFAFC,0xFB40-0xFB7E,0xFB80-0xFBFC,0xFC40-0xFC4B)]
SHIFTJIS.TXT vs JAPANESE.TXT
[only second file has extra 318 codes... (0x80,0xA0,0xFD-0xFF,0x8540-0x8553,0x855E-0x8571,0x857C-0x857E,0x8580-0x8585,0x8591-0x859A,0x859F-0x85AD,0x85B3-0x85C1,0x85DB-0x85F4,0x8640-0x865D,0x869B-0x86A6,0x86B3-0x86B5,0x86C7-0x86D6,0x8740-0x8758,0x8791-0x87B5,0x87BD-0x87C1,0x87E5-0x87E8,0x87FA-0x87FC,0x8840-0x8842,0x8854-0x8855,0x8868,0x886A-0x886D,0xEB41-0xEB42,0xEB50-0xEB51,0xEB5B-0xEB5D,0xEB60-0xEB64,0xEB69-0xEB7A,0xEB81,0xEC9F,0xECA1,0xECA3,0xECA5,0xECA7,0xECC1,0xECE1,0xECE3,0xECE5,0xECEC,0xED40,0xED42,0xED44,0xED46,0xED48,0xED62,0xED83,0xED85,0xED87,0xED8E,0xED95-0xED96)]
ほぉ。
0xFD 0x00A9 # COPYRIGHT SIGN # Apple addition (©)
のコピーライトサインってMacJapaneseだけなのか・・・。