196 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			196 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package charset
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"unicode/utf8"
 | |
| )
 | |
| 
 | |
| func init() {
 | |
| 	registerClass("cp932", fromCP932, nil)
 | |
| }
 | |
| 
 | |
| // encoding details
 | |
| // (Traditional) Shift-JIS
 | |
| //
 | |
| // 00..1f	control characters
 | |
| // 20		space
 | |
| // 21..7f	JIS X 0201:1976/1997 roman (see notes)
 | |
| // 80		undefined
 | |
| // 81..9f	lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
 | |
| // a0		undefined
 | |
| // a1..df	JIS X 0201:1976/1997 katakana
 | |
| // e0..ea	lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
 | |
| // eb..ff	undefined
 | |
| //
 | |
| // CP932 (windows-31J)
 | |
| //
 | |
| // this encoding scheme extends Shift-JIS in the following way
 | |
| //
 | |
| // eb..ec	undefined (marked as lead bytes - see notes below)
 | |
| // ed..ee	lead byte of NEC-selected IBM extended characters
 | |
| // ef		undefined (marked as lead byte - see notes below)
 | |
| // f0..f9	lead byte of User defined GAIJI (see note below)
 | |
| // fa..fc	lead byte of IBM extended characters
 | |
| // fd..ff	undefined
 | |
| //
 | |
| //
 | |
| // Notes
 | |
| //
 | |
| // JISX 0201:1976/1997 roman
 | |
| //	this is the same as ASCII but with 0x5c (ASCII code for '\')
 | |
| //	representing the Yen currency symbol '¥' (U+00a5)
 | |
| //	This mapping is contentious; some conversion packages implement it,
 | |
| //	others do not.
 | |
| //	The mapping files from The Unicode Consortium show cp932 mapping
 | |
| //	plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
 | |
| //	symbol (¥) and 0x7e ('~') to overline (¯)
 | |
| //
 | |
| // CP932 double-byte character codes:
 | |
| //
 | |
| // eb-ec, ef, f0-f9:
 | |
| // 	Marked as DBCS LEAD BYTEs in the unicode mapping data
 | |
| //	obtained from:
 | |
| //		https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
 | |
| //
 | |
| // 	but there are no defined mappings for codes in this range.
 | |
| // 	It is not clear whether or not an implementation should
 | |
| // 	consume one or two bytes before emitting an error char.
 | |
| 
 | |
// Geometry of the mapping tables loaded by fromCP932.
const (
	// Single-byte katakana: one page covering bytes a1..df.
	kanaChar0    = 0xa1
	kanaPageSize = 0xdf - kanaChar0 + 1
	kanaPages    = 1

	// Double-byte characters: each page is indexed by the trail byte,
	// which runs over 40..fc (including 7f).
	cp932Char0    = 0x40
	cp932PageSize = 0xfc - cp932Char0 + 1

	// One page per lead byte: 81..84, 87..9f, e0..ea, ed..ee, fa..fc.
	cp932Pages = (0x84 - 0x81 + 1) +
		(0x9f - 0x87 + 1) +
		(0xea - 0xe0 + 1) +
		(0xee - 0xed + 1) +
		(0xfc - 0xfa + 1)
)
 | |
| 
 | |
// jisTables holds the lookup tables used to decode CP932 / Shift-JIS input.
type jisTables struct {
	// page0 maps each input byte to its rune; an entry of -1 marks the
	// byte as the lead byte of a double-byte character.
	page0 [256]rune

	// dbcsoff gives, for each lead byte, the page number within cp932.
	dbcsoff [256]int

	// cp932 is the double-byte mapping data: consecutive pages of
	// cp932PageSize runes, each indexed by the trail byte minus cp932Char0.
	cp932 []rune
}
 | |
| 
 | |
| type translateFromCP932 struct {
 | |
| 	tables  *jisTables
 | |
| 	scratch []byte
 | |
| }
 | |
| 
 | |
| func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
 | |
| 	tables := p.tables
 | |
| 	p.scratch = p.scratch[:0]
 | |
| 	n := 0
 | |
| 	for i := 0; i < len(data); i++ {
 | |
| 		b := data[i]
 | |
| 		r := tables.page0[b]
 | |
| 		if r != -1 {
 | |
| 			p.scratch = appendRune(p.scratch, r)
 | |
| 			n++
 | |
| 			continue
 | |
| 		}
 | |
| 		// DBCS
 | |
| 		i++
 | |
| 		if i >= len(data) {
 | |
| 			break
 | |
| 		}
 | |
| 		pnum := tables.dbcsoff[b]
 | |
| 		ix := int(data[i]) - cp932Char0
 | |
| 		if pnum == -1 || ix < 0 || ix >= cp932PageSize {
 | |
| 			r = utf8.RuneError
 | |
| 		} else {
 | |
| 			r = tables.cp932[pnum*cp932PageSize+ix]
 | |
| 		}
 | |
| 		p.scratch = appendRune(p.scratch, r)
 | |
| 		n += 2
 | |
| 	}
 | |
| 	return n, p.scratch, nil
 | |
| }
 | |
| 
 | |
// cp932Key is the key type fromCP932 passes to cache for its lookup tables:
// true selects the tables built for "shiftjis", false those for CP932.
type cp932Key bool
 | |
| 
 | |
| func fromCP932(arg string) (Translator, error) {
 | |
| 	shiftJIS := arg == "shiftjis"
 | |
| 	tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
 | |
| 		tables := new(jisTables)
 | |
| 		kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
 | |
| 		if err != nil {
 | |
| 			return nil, err
 | |
| 		}
 | |
| 		tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
 | |
| 		if err != nil {
 | |
| 			return nil, err
 | |
| 		}
 | |
| 
 | |
| 		// jisx0201kana is mapped into 0xA1..0xDF
 | |
| 		for i := 0; i < kanaPageSize; i++ {
 | |
| 			tables.page0[i+kanaChar0] = kana[i]
 | |
| 		}
 | |
| 
 | |
| 		// 00..7f same as ascii in cp932
 | |
| 		for i := rune(0); i < 0x7f; i++ {
 | |
| 			tables.page0[i] = i
 | |
| 		}
 | |
| 
 | |
| 		if shiftJIS {
 | |
| 			// shift-jis uses JIS X 0201 for the ASCII range
 | |
| 			// this is the same as ASCII apart from
 | |
| 			// 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
 | |
| 			tables.page0['\\'] = '¥'
 | |
| 			tables.page0['~'] = '¯'
 | |
| 		}
 | |
| 
 | |
| 		// pre-calculate DBCS page numbers to mapping file page numbers
 | |
| 		// and mark codes in page0 that are DBCS lead bytes
 | |
| 		pnum := 0
 | |
| 		for i := 0x81; i <= 0x84; i++ {
 | |
| 			tables.page0[i] = -1
 | |
| 			tables.dbcsoff[i] = pnum
 | |
| 			pnum++
 | |
| 		}
 | |
| 		for i := 0x87; i <= 0x9f; i++ {
 | |
| 			tables.page0[i] = -1
 | |
| 			tables.dbcsoff[i] = pnum
 | |
| 			pnum++
 | |
| 		}
 | |
| 		for i := 0xe0; i <= 0xea; i++ {
 | |
| 			tables.page0[i] = -1
 | |
| 			tables.dbcsoff[i] = pnum
 | |
| 			pnum++
 | |
| 		}
 | |
| 		if shiftJIS {
 | |
| 			return tables, nil
 | |
| 		}
 | |
| 		// add in cp932 extensions
 | |
| 		for i := 0xed; i <= 0xee; i++ {
 | |
| 			tables.page0[i] = -1
 | |
| 			tables.dbcsoff[i] = pnum
 | |
| 			pnum++
 | |
| 		}
 | |
| 		for i := 0xfa; i <= 0xfc; i++ {
 | |
| 			tables.page0[i] = -1
 | |
| 			tables.dbcsoff[i] = pnum
 | |
| 			pnum++
 | |
| 		}
 | |
| 		return tables, nil
 | |
| 	})
 | |
| 
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	return &translateFromCP932{tables: tables.(*jisTables)}, nil
 | |
| }
 | |
| 
 | |
| func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
 | |
| 	data, err := readFile(name)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	m := []rune(string(data))
 | |
| 	if len(m) != pgsize*npages {
 | |
| 		return nil, fmt.Errorf("%q: incorrect length data", name)
 | |
| 	}
 | |
| 	return m, nil
 | |
| }
 | 
