forked from lug/matterbridge
Update vendor
This commit is contained in:
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+999
@@ -0,0 +1,999 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package acme provides an implementation of the
|
||||
// Automatic Certificate Management Environment (ACME) spec.
|
||||
// See https://tools.ietf.org/html/draft-ietf-acme-acme-02 for details.
|
||||
//
|
||||
// Most common scenarios will want to use autocert subdirectory instead,
|
||||
// which provides automatic access to certificates from Let's Encrypt
|
||||
// and any other ACME-based CA.
|
||||
//
|
||||
// This package is a work in progress and makes no API stability promises.
|
||||
package acme
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto"
|
||||
"crypto/ecdsa"
|
||||
"crypto/elliptic"
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"encoding/pem"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"math/big"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
"golang.org/x/net/context/ctxhttp"
|
||||
)
|
||||
|
||||
// LetsEncryptURL is the Directory endpoint of Let's Encrypt CA.
|
||||
const LetsEncryptURL = "https://acme-v01.api.letsencrypt.org/directory"
|
||||
|
||||
const (
|
||||
maxChainLen = 5 // max depth and breadth of a certificate chain
|
||||
maxCertSize = 1 << 20 // max size of a certificate, in bytes
|
||||
|
||||
// Max number of collected nonces kept in memory.
|
||||
// Expect usual peak of 1 or 2.
|
||||
maxNonces = 100
|
||||
)
|
||||
|
||||
// CertOption is an optional argument type for Client methods which manipulate
|
||||
// certificate data.
|
||||
type CertOption interface {
|
||||
privateCertOpt()
|
||||
}
|
||||
|
||||
// WithKey creates an option holding a private/public key pair.
|
||||
// The private part signs a certificate, and the public part represents the signee.
|
||||
func WithKey(key crypto.Signer) CertOption {
|
||||
return &certOptKey{key}
|
||||
}
|
||||
|
||||
type certOptKey struct {
|
||||
key crypto.Signer
|
||||
}
|
||||
|
||||
func (*certOptKey) privateCertOpt() {}
|
||||
|
||||
// WithTemplate creates an option for specifying a certificate template.
|
||||
// See x509.CreateCertificate for template usage details.
|
||||
//
|
||||
// In TLSSNIxChallengeCert methods, the template is also used as parent,
|
||||
// resulting in a self-signed certificate.
|
||||
// The DNSNames field of t is always overwritten for tls-sni challenge certs.
|
||||
func WithTemplate(t *x509.Certificate) CertOption {
|
||||
return (*certOptTemplate)(t)
|
||||
}
|
||||
|
||||
type certOptTemplate x509.Certificate
|
||||
|
||||
func (*certOptTemplate) privateCertOpt() {}
|
||||
|
||||
// Client is an ACME client.
|
||||
// The only required field is Key. An example of creating a client with a new key
|
||||
// is as follows:
|
||||
//
|
||||
// key, err := rsa.GenerateKey(rand.Reader, 2048)
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// client := &Client{Key: key}
|
||||
//
|
||||
type Client struct {
|
||||
// Key is the account key used to register with a CA and sign requests.
|
||||
// Key.Public() must return a *rsa.PublicKey or *ecdsa.PublicKey.
|
||||
Key crypto.Signer
|
||||
|
||||
// HTTPClient optionally specifies an HTTP client to use
|
||||
// instead of http.DefaultClient.
|
||||
HTTPClient *http.Client
|
||||
|
||||
// DirectoryURL points to the CA directory endpoint.
|
||||
// If empty, LetsEncryptURL is used.
|
||||
// Mutating this value after a successful call of Client's Discover method
|
||||
// will have no effect.
|
||||
DirectoryURL string
|
||||
|
||||
dirMu sync.Mutex // guards writes to dir
|
||||
dir *Directory // cached result of Client's Discover method
|
||||
|
||||
noncesMu sync.Mutex
|
||||
nonces map[string]struct{} // nonces collected from previous responses
|
||||
}
|
||||
|
||||
// Discover performs ACME server discovery using c.DirectoryURL.
|
||||
//
|
||||
// It caches successful result. So, subsequent calls will not result in
|
||||
// a network round-trip. This also means mutating c.DirectoryURL after successful call
|
||||
// of this method will have no effect.
|
||||
func (c *Client) Discover(ctx context.Context) (Directory, error) {
|
||||
c.dirMu.Lock()
|
||||
defer c.dirMu.Unlock()
|
||||
if c.dir != nil {
|
||||
return *c.dir, nil
|
||||
}
|
||||
|
||||
dirURL := c.DirectoryURL
|
||||
if dirURL == "" {
|
||||
dirURL = LetsEncryptURL
|
||||
}
|
||||
res, err := ctxhttp.Get(ctx, c.HTTPClient, dirURL)
|
||||
if err != nil {
|
||||
return Directory{}, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
c.addNonce(res.Header)
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return Directory{}, responseError(res)
|
||||
}
|
||||
|
||||
var v struct {
|
||||
Reg string `json:"new-reg"`
|
||||
Authz string `json:"new-authz"`
|
||||
Cert string `json:"new-cert"`
|
||||
Revoke string `json:"revoke-cert"`
|
||||
Meta struct {
|
||||
Terms string `json:"terms-of-service"`
|
||||
Website string `json:"website"`
|
||||
CAA []string `json:"caa-identities"`
|
||||
}
|
||||
}
|
||||
if json.NewDecoder(res.Body).Decode(&v); err != nil {
|
||||
return Directory{}, err
|
||||
}
|
||||
c.dir = &Directory{
|
||||
RegURL: v.Reg,
|
||||
AuthzURL: v.Authz,
|
||||
CertURL: v.Cert,
|
||||
RevokeURL: v.Revoke,
|
||||
Terms: v.Meta.Terms,
|
||||
Website: v.Meta.Website,
|
||||
CAA: v.Meta.CAA,
|
||||
}
|
||||
return *c.dir, nil
|
||||
}
|
||||
|
||||
// CreateCert requests a new certificate using the Certificate Signing Request csr encoded in DER format.
|
||||
// The exp argument indicates the desired certificate validity duration. CA may issue a certificate
|
||||
// with a different duration.
|
||||
// If the bundle argument is true, the returned value will also contain the CA (issuer) certificate chain.
|
||||
//
|
||||
// In the case where CA server does not provide the issued certificate in the response,
|
||||
// CreateCert will poll certURL using c.FetchCert, which will result in additional round-trips.
|
||||
// In such scenario the caller can cancel the polling with ctx.
|
||||
//
|
||||
// CreateCert returns an error if the CA's response or chain was unreasonably large.
|
||||
// Callers are encouraged to parse the returned value to ensure the certificate is valid and has the expected features.
|
||||
func (c *Client) CreateCert(ctx context.Context, csr []byte, exp time.Duration, bundle bool) (der [][]byte, certURL string, err error) {
|
||||
if _, err := c.Discover(ctx); err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
req := struct {
|
||||
Resource string `json:"resource"`
|
||||
CSR string `json:"csr"`
|
||||
NotBefore string `json:"notBefore,omitempty"`
|
||||
NotAfter string `json:"notAfter,omitempty"`
|
||||
}{
|
||||
Resource: "new-cert",
|
||||
CSR: base64.RawURLEncoding.EncodeToString(csr),
|
||||
}
|
||||
now := timeNow()
|
||||
req.NotBefore = now.Format(time.RFC3339)
|
||||
if exp > 0 {
|
||||
req.NotAfter = now.Add(exp).Format(time.RFC3339)
|
||||
}
|
||||
|
||||
res, err := c.postJWS(ctx, c.Key, c.dir.CertURL, req)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusCreated {
|
||||
return nil, "", responseError(res)
|
||||
}
|
||||
|
||||
curl := res.Header.Get("location") // cert permanent URL
|
||||
if res.ContentLength == 0 {
|
||||
// no cert in the body; poll until we get it
|
||||
cert, err := c.FetchCert(ctx, curl, bundle)
|
||||
return cert, curl, err
|
||||
}
|
||||
// slurp issued cert and CA chain, if requested
|
||||
cert, err := responseCert(ctx, c.HTTPClient, res, bundle)
|
||||
return cert, curl, err
|
||||
}
|
||||
|
||||
// FetchCert retrieves already issued certificate from the given url, in DER format.
|
||||
// It retries the request until the certificate is successfully retrieved,
|
||||
// context is cancelled by the caller or an error response is received.
|
||||
//
|
||||
// The returned value will also contain the CA (issuer) certificate if the bundle argument is true.
|
||||
//
|
||||
// FetchCert returns an error if the CA's response or chain was unreasonably large.
|
||||
// Callers are encouraged to parse the returned value to ensure the certificate is valid
|
||||
// and has expected features.
|
||||
func (c *Client) FetchCert(ctx context.Context, url string, bundle bool) ([][]byte, error) {
|
||||
for {
|
||||
res, err := ctxhttp.Get(ctx, c.HTTPClient, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode == http.StatusOK {
|
||||
return responseCert(ctx, c.HTTPClient, res, bundle)
|
||||
}
|
||||
if res.StatusCode > 299 {
|
||||
return nil, responseError(res)
|
||||
}
|
||||
d := retryAfter(res.Header.Get("retry-after"), 3*time.Second)
|
||||
select {
|
||||
case <-time.After(d):
|
||||
// retry
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RevokeCert revokes a previously issued certificate cert, provided in DER format.
|
||||
//
|
||||
// The key argument, used to sign the request, must be authorized
|
||||
// to revoke the certificate. It's up to the CA to decide which keys are authorized.
|
||||
// For instance, the key pair of the certificate may be authorized.
|
||||
// If the key is nil, c.Key is used instead.
|
||||
func (c *Client) RevokeCert(ctx context.Context, key crypto.Signer, cert []byte, reason CRLReasonCode) error {
|
||||
if _, err := c.Discover(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
body := &struct {
|
||||
Resource string `json:"resource"`
|
||||
Cert string `json:"certificate"`
|
||||
Reason int `json:"reason"`
|
||||
}{
|
||||
Resource: "revoke-cert",
|
||||
Cert: base64.RawURLEncoding.EncodeToString(cert),
|
||||
Reason: int(reason),
|
||||
}
|
||||
if key == nil {
|
||||
key = c.Key
|
||||
}
|
||||
res, err := c.postJWS(ctx, key, c.dir.RevokeURL, body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return responseError(res)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// AcceptTOS always returns true to indicate the acceptance of a CA's Terms of Service
|
||||
// during account registration. See Register method of Client for more details.
|
||||
func AcceptTOS(tosURL string) bool { return true }
|
||||
|
||||
// Register creates a new account registration by following the "new-reg" flow.
|
||||
// It returns registered account. The a argument is not modified.
|
||||
//
|
||||
// The registration may require the caller to agree to the CA's Terms of Service (TOS).
|
||||
// If so, and the account has not indicated the acceptance of the terms (see Account for details),
|
||||
// Register calls prompt with a TOS URL provided by the CA. Prompt should report
|
||||
// whether the caller agrees to the terms. To always accept the terms, the caller can use AcceptTOS.
|
||||
func (c *Client) Register(ctx context.Context, a *Account, prompt func(tosURL string) bool) (*Account, error) {
|
||||
if _, err := c.Discover(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var err error
|
||||
if a, err = c.doReg(ctx, c.dir.RegURL, "new-reg", a); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var accept bool
|
||||
if a.CurrentTerms != "" && a.CurrentTerms != a.AgreedTerms {
|
||||
accept = prompt(a.CurrentTerms)
|
||||
}
|
||||
if accept {
|
||||
a.AgreedTerms = a.CurrentTerms
|
||||
a, err = c.UpdateReg(ctx, a)
|
||||
}
|
||||
return a, err
|
||||
}
|
||||
|
||||
// GetReg retrieves an existing registration.
|
||||
// The url argument is an Account URI.
|
||||
func (c *Client) GetReg(ctx context.Context, url string) (*Account, error) {
|
||||
a, err := c.doReg(ctx, url, "reg", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
a.URI = url
|
||||
return a, nil
|
||||
}
|
||||
|
||||
// UpdateReg updates an existing registration.
|
||||
// It returns an updated account copy. The provided account is not modified.
|
||||
func (c *Client) UpdateReg(ctx context.Context, a *Account) (*Account, error) {
|
||||
uri := a.URI
|
||||
a, err := c.doReg(ctx, uri, "reg", a)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
a.URI = uri
|
||||
return a, nil
|
||||
}
|
||||
|
||||
// Authorize performs the initial step in an authorization flow.
|
||||
// The caller will then need to choose from and perform a set of returned
|
||||
// challenges using c.Accept in order to successfully complete authorization.
|
||||
//
|
||||
// If an authorization has been previously granted, the CA may return
|
||||
// a valid authorization (Authorization.Status is StatusValid). If so, the caller
|
||||
// need not fulfill any challenge and can proceed to requesting a certificate.
|
||||
func (c *Client) Authorize(ctx context.Context, domain string) (*Authorization, error) {
|
||||
if _, err := c.Discover(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
type authzID struct {
|
||||
Type string `json:"type"`
|
||||
Value string `json:"value"`
|
||||
}
|
||||
req := struct {
|
||||
Resource string `json:"resource"`
|
||||
Identifier authzID `json:"identifier"`
|
||||
}{
|
||||
Resource: "new-authz",
|
||||
Identifier: authzID{Type: "dns", Value: domain},
|
||||
}
|
||||
res, err := c.postJWS(ctx, c.Key, c.dir.AuthzURL, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusCreated {
|
||||
return nil, responseError(res)
|
||||
}
|
||||
|
||||
var v wireAuthz
|
||||
if err := json.NewDecoder(res.Body).Decode(&v); err != nil {
|
||||
return nil, fmt.Errorf("acme: invalid response: %v", err)
|
||||
}
|
||||
if v.Status != StatusPending && v.Status != StatusValid {
|
||||
return nil, fmt.Errorf("acme: unexpected status: %s", v.Status)
|
||||
}
|
||||
return v.authorization(res.Header.Get("Location")), nil
|
||||
}
|
||||
|
||||
// GetAuthorization retrieves an authorization identified by the given URL.
|
||||
//
|
||||
// If a caller needs to poll an authorization until its status is final,
|
||||
// see the WaitAuthorization method.
|
||||
func (c *Client) GetAuthorization(ctx context.Context, url string) (*Authorization, error) {
|
||||
res, err := ctxhttp.Get(ctx, c.HTTPClient, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusOK && res.StatusCode != http.StatusAccepted {
|
||||
return nil, responseError(res)
|
||||
}
|
||||
var v wireAuthz
|
||||
if err := json.NewDecoder(res.Body).Decode(&v); err != nil {
|
||||
return nil, fmt.Errorf("acme: invalid response: %v", err)
|
||||
}
|
||||
return v.authorization(url), nil
|
||||
}
|
||||
|
||||
// RevokeAuthorization relinquishes an existing authorization identified
|
||||
// by the given URL.
|
||||
// The url argument is an Authorization.URI value.
|
||||
//
|
||||
// If successful, the caller will be required to obtain a new authorization
|
||||
// using the Authorize method before being able to request a new certificate
|
||||
// for the domain associated with the authorization.
|
||||
//
|
||||
// It does not revoke existing certificates.
|
||||
func (c *Client) RevokeAuthorization(ctx context.Context, url string) error {
|
||||
req := struct {
|
||||
Resource string `json:"resource"`
|
||||
Status string `json:"status"`
|
||||
Delete bool `json:"delete"`
|
||||
}{
|
||||
Resource: "authz",
|
||||
Status: "deactivated",
|
||||
Delete: true,
|
||||
}
|
||||
res, err := c.postJWS(ctx, c.Key, url, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return responseError(res)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// WaitAuthorization polls an authorization at the given URL
|
||||
// until it is in one of the final states, StatusValid or StatusInvalid,
|
||||
// or the context is done.
|
||||
//
|
||||
// It returns a non-nil Authorization only if its Status is StatusValid.
|
||||
// In all other cases WaitAuthorization returns an error.
|
||||
// If the Status is StatusInvalid, the returned error is ErrAuthorizationFailed.
|
||||
func (c *Client) WaitAuthorization(ctx context.Context, url string) (*Authorization, error) {
|
||||
var count int
|
||||
sleep := func(v string, inc int) error {
|
||||
count += inc
|
||||
d := backoff(count, 10*time.Second)
|
||||
d = retryAfter(v, d)
|
||||
wakeup := time.NewTimer(d)
|
||||
defer wakeup.Stop()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-wakeup.C:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
for {
|
||||
res, err := ctxhttp.Get(ctx, c.HTTPClient, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
retry := res.Header.Get("retry-after")
|
||||
if res.StatusCode != http.StatusOK && res.StatusCode != http.StatusAccepted {
|
||||
res.Body.Close()
|
||||
if err := sleep(retry, 1); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
continue
|
||||
}
|
||||
var raw wireAuthz
|
||||
err = json.NewDecoder(res.Body).Decode(&raw)
|
||||
res.Body.Close()
|
||||
if err != nil {
|
||||
if err := sleep(retry, 0); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
continue
|
||||
}
|
||||
if raw.Status == StatusValid {
|
||||
return raw.authorization(url), nil
|
||||
}
|
||||
if raw.Status == StatusInvalid {
|
||||
return nil, ErrAuthorizationFailed
|
||||
}
|
||||
if err := sleep(retry, 0); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GetChallenge retrieves the current status of an challenge.
|
||||
//
|
||||
// A client typically polls a challenge status using this method.
|
||||
func (c *Client) GetChallenge(ctx context.Context, url string) (*Challenge, error) {
|
||||
res, err := ctxhttp.Get(ctx, c.HTTPClient, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusOK && res.StatusCode != http.StatusAccepted {
|
||||
return nil, responseError(res)
|
||||
}
|
||||
v := wireChallenge{URI: url}
|
||||
if err := json.NewDecoder(res.Body).Decode(&v); err != nil {
|
||||
return nil, fmt.Errorf("acme: invalid response: %v", err)
|
||||
}
|
||||
return v.challenge(), nil
|
||||
}
|
||||
|
||||
// Accept informs the server that the client accepts one of its challenges
|
||||
// previously obtained with c.Authorize.
|
||||
//
|
||||
// The server will then perform the validation asynchronously.
|
||||
func (c *Client) Accept(ctx context.Context, chal *Challenge) (*Challenge, error) {
|
||||
auth, err := keyAuth(c.Key.Public(), chal.Token)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := struct {
|
||||
Resource string `json:"resource"`
|
||||
Type string `json:"type"`
|
||||
Auth string `json:"keyAuthorization"`
|
||||
}{
|
||||
Resource: "challenge",
|
||||
Type: chal.Type,
|
||||
Auth: auth,
|
||||
}
|
||||
res, err := c.postJWS(ctx, c.Key, chal.URI, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
// Note: the protocol specifies 200 as the expected response code, but
|
||||
// letsencrypt seems to be returning 202.
|
||||
if res.StatusCode != http.StatusOK && res.StatusCode != http.StatusAccepted {
|
||||
return nil, responseError(res)
|
||||
}
|
||||
|
||||
var v wireChallenge
|
||||
if err := json.NewDecoder(res.Body).Decode(&v); err != nil {
|
||||
return nil, fmt.Errorf("acme: invalid response: %v", err)
|
||||
}
|
||||
return v.challenge(), nil
|
||||
}
|
||||
|
||||
// DNS01ChallengeRecord returns a DNS record value for a dns-01 challenge response.
|
||||
// A TXT record containing the returned value must be provisioned under
|
||||
// "_acme-challenge" name of the domain being validated.
|
||||
//
|
||||
// The token argument is a Challenge.Token value.
|
||||
func (c *Client) DNS01ChallengeRecord(token string) (string, error) {
|
||||
ka, err := keyAuth(c.Key.Public(), token)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
b := sha256.Sum256([]byte(ka))
|
||||
return base64.RawURLEncoding.EncodeToString(b[:]), nil
|
||||
}
|
||||
|
||||
// HTTP01ChallengeResponse returns the response for an http-01 challenge.
|
||||
// Servers should respond with the value to HTTP requests at the URL path
|
||||
// provided by HTTP01ChallengePath to validate the challenge and prove control
|
||||
// over a domain name.
|
||||
//
|
||||
// The token argument is a Challenge.Token value.
|
||||
func (c *Client) HTTP01ChallengeResponse(token string) (string, error) {
|
||||
return keyAuth(c.Key.Public(), token)
|
||||
}
|
||||
|
||||
// HTTP01ChallengePath returns the URL path at which the response for an http-01 challenge
|
||||
// should be provided by the servers.
|
||||
// The response value can be obtained with HTTP01ChallengeResponse.
|
||||
//
|
||||
// The token argument is a Challenge.Token value.
|
||||
func (c *Client) HTTP01ChallengePath(token string) string {
|
||||
return "/.well-known/acme-challenge/" + token
|
||||
}
|
||||
|
||||
// TLSSNI01ChallengeCert creates a certificate for TLS-SNI-01 challenge response.
|
||||
// Servers can present the certificate to validate the challenge and prove control
|
||||
// over a domain name.
|
||||
//
|
||||
// The implementation is incomplete in that the returned value is a single certificate,
|
||||
// computed only for Z0 of the key authorization. ACME CAs are expected to update
|
||||
// their implementations to use the newer version, TLS-SNI-02.
|
||||
// For more details on TLS-SNI-01 see https://tools.ietf.org/html/draft-ietf-acme-acme-01#section-7.3.
|
||||
//
|
||||
// The token argument is a Challenge.Token value.
|
||||
// If a WithKey option is provided, its private part signs the returned cert,
|
||||
// and the public part is used to specify the signee.
|
||||
// If no WithKey option is provided, a new ECDSA key is generated using P-256 curve.
|
||||
//
|
||||
// The returned certificate is valid for the next 24 hours and must be presented only when
|
||||
// the server name of the client hello matches exactly the returned name value.
|
||||
func (c *Client) TLSSNI01ChallengeCert(token string, opt ...CertOption) (cert tls.Certificate, name string, err error) {
|
||||
ka, err := keyAuth(c.Key.Public(), token)
|
||||
if err != nil {
|
||||
return tls.Certificate{}, "", err
|
||||
}
|
||||
b := sha256.Sum256([]byte(ka))
|
||||
h := hex.EncodeToString(b[:])
|
||||
name = fmt.Sprintf("%s.%s.acme.invalid", h[:32], h[32:])
|
||||
cert, err = tlsChallengeCert([]string{name}, opt)
|
||||
if err != nil {
|
||||
return tls.Certificate{}, "", err
|
||||
}
|
||||
return cert, name, nil
|
||||
}
|
||||
|
||||
// TLSSNI02ChallengeCert creates a certificate for TLS-SNI-02 challenge response.
|
||||
// Servers can present the certificate to validate the challenge and prove control
|
||||
// over a domain name. For more details on TLS-SNI-02 see
|
||||
// https://tools.ietf.org/html/draft-ietf-acme-acme-03#section-7.3.
|
||||
//
|
||||
// The token argument is a Challenge.Token value.
|
||||
// If a WithKey option is provided, its private part signs the returned cert,
|
||||
// and the public part is used to specify the signee.
|
||||
// If no WithKey option is provided, a new ECDSA key is generated using P-256 curve.
|
||||
//
|
||||
// The returned certificate is valid for the next 24 hours and must be presented only when
|
||||
// the server name in the client hello matches exactly the returned name value.
|
||||
func (c *Client) TLSSNI02ChallengeCert(token string, opt ...CertOption) (cert tls.Certificate, name string, err error) {
|
||||
b := sha256.Sum256([]byte(token))
|
||||
h := hex.EncodeToString(b[:])
|
||||
sanA := fmt.Sprintf("%s.%s.token.acme.invalid", h[:32], h[32:])
|
||||
|
||||
ka, err := keyAuth(c.Key.Public(), token)
|
||||
if err != nil {
|
||||
return tls.Certificate{}, "", err
|
||||
}
|
||||
b = sha256.Sum256([]byte(ka))
|
||||
h = hex.EncodeToString(b[:])
|
||||
sanB := fmt.Sprintf("%s.%s.ka.acme.invalid", h[:32], h[32:])
|
||||
|
||||
cert, err = tlsChallengeCert([]string{sanA, sanB}, opt)
|
||||
if err != nil {
|
||||
return tls.Certificate{}, "", err
|
||||
}
|
||||
return cert, sanA, nil
|
||||
}
|
||||
|
||||
// doReg sends all types of registration requests.
|
||||
// The type of request is identified by typ argument, which is a "resource"
|
||||
// in the ACME spec terms.
|
||||
//
|
||||
// A non-nil acct argument indicates whether the intention is to mutate data
|
||||
// of the Account. Only Contact and Agreement of its fields are used
|
||||
// in such cases.
|
||||
func (c *Client) doReg(ctx context.Context, url string, typ string, acct *Account) (*Account, error) {
|
||||
req := struct {
|
||||
Resource string `json:"resource"`
|
||||
Contact []string `json:"contact,omitempty"`
|
||||
Agreement string `json:"agreement,omitempty"`
|
||||
}{
|
||||
Resource: typ,
|
||||
}
|
||||
if acct != nil {
|
||||
req.Contact = acct.Contact
|
||||
req.Agreement = acct.AgreedTerms
|
||||
}
|
||||
res, err := c.postJWS(ctx, c.Key, url, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode < 200 || res.StatusCode > 299 {
|
||||
return nil, responseError(res)
|
||||
}
|
||||
|
||||
var v struct {
|
||||
Contact []string
|
||||
Agreement string
|
||||
Authorizations string
|
||||
Certificates string
|
||||
}
|
||||
if err := json.NewDecoder(res.Body).Decode(&v); err != nil {
|
||||
return nil, fmt.Errorf("acme: invalid response: %v", err)
|
||||
}
|
||||
var tos string
|
||||
if v := linkHeader(res.Header, "terms-of-service"); len(v) > 0 {
|
||||
tos = v[0]
|
||||
}
|
||||
var authz string
|
||||
if v := linkHeader(res.Header, "next"); len(v) > 0 {
|
||||
authz = v[0]
|
||||
}
|
||||
return &Account{
|
||||
URI: res.Header.Get("Location"),
|
||||
Contact: v.Contact,
|
||||
AgreedTerms: v.Agreement,
|
||||
CurrentTerms: tos,
|
||||
Authz: authz,
|
||||
Authorizations: v.Authorizations,
|
||||
Certificates: v.Certificates,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// postJWS signs the body with the given key and POSTs it to the provided url.
|
||||
// The body argument must be JSON-serializable.
|
||||
func (c *Client) postJWS(ctx context.Context, key crypto.Signer, url string, body interface{}) (*http.Response, error) {
|
||||
nonce, err := c.popNonce(ctx, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b, err := jwsEncodeJSON(body, key, nonce)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
res, err := ctxhttp.Post(ctx, c.HTTPClient, url, "application/jose+json", bytes.NewReader(b))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
c.addNonce(res.Header)
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// popNonce returns a nonce value previously stored with c.addNonce
|
||||
// or fetches a fresh one from the given URL.
|
||||
func (c *Client) popNonce(ctx context.Context, url string) (string, error) {
|
||||
c.noncesMu.Lock()
|
||||
defer c.noncesMu.Unlock()
|
||||
if len(c.nonces) == 0 {
|
||||
return fetchNonce(ctx, c.HTTPClient, url)
|
||||
}
|
||||
var nonce string
|
||||
for nonce = range c.nonces {
|
||||
delete(c.nonces, nonce)
|
||||
break
|
||||
}
|
||||
return nonce, nil
|
||||
}
|
||||
|
||||
// addNonce stores a nonce value found in h (if any) for future use.
|
||||
func (c *Client) addNonce(h http.Header) {
|
||||
v := nonceFromHeader(h)
|
||||
if v == "" {
|
||||
return
|
||||
}
|
||||
c.noncesMu.Lock()
|
||||
defer c.noncesMu.Unlock()
|
||||
if len(c.nonces) >= maxNonces {
|
||||
return
|
||||
}
|
||||
if c.nonces == nil {
|
||||
c.nonces = make(map[string]struct{})
|
||||
}
|
||||
c.nonces[v] = struct{}{}
|
||||
}
|
||||
|
||||
func fetchNonce(ctx context.Context, client *http.Client, url string) (string, error) {
|
||||
resp, err := ctxhttp.Head(ctx, client, url)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
nonce := nonceFromHeader(resp.Header)
|
||||
if nonce == "" {
|
||||
if resp.StatusCode > 299 {
|
||||
return "", responseError(resp)
|
||||
}
|
||||
return "", errors.New("acme: nonce not found")
|
||||
}
|
||||
return nonce, nil
|
||||
}
|
||||
|
||||
func nonceFromHeader(h http.Header) string {
|
||||
return h.Get("Replay-Nonce")
|
||||
}
|
||||
|
||||
func responseCert(ctx context.Context, client *http.Client, res *http.Response, bundle bool) ([][]byte, error) {
|
||||
b, err := ioutil.ReadAll(io.LimitReader(res.Body, maxCertSize+1))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("acme: response stream: %v", err)
|
||||
}
|
||||
if len(b) > maxCertSize {
|
||||
return nil, errors.New("acme: certificate is too big")
|
||||
}
|
||||
cert := [][]byte{b}
|
||||
if !bundle {
|
||||
return cert, nil
|
||||
}
|
||||
|
||||
// Append CA chain cert(s).
|
||||
// At least one is required according to the spec:
|
||||
// https://tools.ietf.org/html/draft-ietf-acme-acme-03#section-6.3.1
|
||||
up := linkHeader(res.Header, "up")
|
||||
if len(up) == 0 {
|
||||
return nil, errors.New("acme: rel=up link not found")
|
||||
}
|
||||
if len(up) > maxChainLen {
|
||||
return nil, errors.New("acme: rel=up link is too large")
|
||||
}
|
||||
for _, url := range up {
|
||||
cc, err := chainCert(ctx, client, url, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cert = append(cert, cc...)
|
||||
}
|
||||
return cert, nil
|
||||
}
|
||||
|
||||
// responseError creates an error of Error type from resp.
|
||||
func responseError(resp *http.Response) error {
|
||||
// don't care if ReadAll returns an error:
|
||||
// json.Unmarshal will fail in that case anyway
|
||||
b, _ := ioutil.ReadAll(resp.Body)
|
||||
e := struct {
|
||||
Status int
|
||||
Type string
|
||||
Detail string
|
||||
}{
|
||||
Status: resp.StatusCode,
|
||||
}
|
||||
if err := json.Unmarshal(b, &e); err != nil {
|
||||
// this is not a regular error response:
|
||||
// populate detail with anything we received,
|
||||
// e.Status will already contain HTTP response code value
|
||||
e.Detail = string(b)
|
||||
if e.Detail == "" {
|
||||
e.Detail = resp.Status
|
||||
}
|
||||
}
|
||||
return &Error{
|
||||
StatusCode: e.Status,
|
||||
ProblemType: e.Type,
|
||||
Detail: e.Detail,
|
||||
Header: resp.Header,
|
||||
}
|
||||
}
|
||||
|
||||
// chainCert fetches CA certificate chain recursively by following "up" links.
|
||||
// Each recursive call increments the depth by 1, resulting in an error
|
||||
// if the recursion level reaches maxChainLen.
|
||||
//
|
||||
// First chainCert call starts with depth of 0.
|
||||
func chainCert(ctx context.Context, client *http.Client, url string, depth int) ([][]byte, error) {
|
||||
if depth >= maxChainLen {
|
||||
return nil, errors.New("acme: certificate chain is too deep")
|
||||
}
|
||||
|
||||
res, err := ctxhttp.Get(ctx, client, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != http.StatusOK {
|
||||
return nil, responseError(res)
|
||||
}
|
||||
b, err := ioutil.ReadAll(io.LimitReader(res.Body, maxCertSize+1))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(b) > maxCertSize {
|
||||
return nil, errors.New("acme: certificate is too big")
|
||||
}
|
||||
chain := [][]byte{b}
|
||||
|
||||
uplink := linkHeader(res.Header, "up")
|
||||
if len(uplink) > maxChainLen {
|
||||
return nil, errors.New("acme: certificate chain is too large")
|
||||
}
|
||||
for _, up := range uplink {
|
||||
cc, err := chainCert(ctx, client, up, depth+1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
chain = append(chain, cc...)
|
||||
}
|
||||
|
||||
return chain, nil
|
||||
}
|
||||
|
||||
// linkHeader returns URI-Reference values of all Link headers
|
||||
// with relation-type rel.
|
||||
// See https://tools.ietf.org/html/rfc5988#section-5 for details.
|
||||
func linkHeader(h http.Header, rel string) []string {
|
||||
var links []string
|
||||
for _, v := range h["Link"] {
|
||||
parts := strings.Split(v, ";")
|
||||
for _, p := range parts {
|
||||
p = strings.TrimSpace(p)
|
||||
if !strings.HasPrefix(p, "rel=") {
|
||||
continue
|
||||
}
|
||||
if v := strings.Trim(p[4:], `"`); v == rel {
|
||||
links = append(links, strings.Trim(parts[0], "<>"))
|
||||
}
|
||||
}
|
||||
}
|
||||
return links
|
||||
}
|
||||
|
||||
// retryAfter parses a Retry-After HTTP header value,
|
||||
// trying to convert v into an int (seconds) or use http.ParseTime otherwise.
|
||||
// It returns d if v cannot be parsed.
|
||||
func retryAfter(v string, d time.Duration) time.Duration {
|
||||
if i, err := strconv.Atoi(v); err == nil {
|
||||
return time.Duration(i) * time.Second
|
||||
}
|
||||
t, err := http.ParseTime(v)
|
||||
if err != nil {
|
||||
return d
|
||||
}
|
||||
return t.Sub(timeNow())
|
||||
}
|
||||
|
||||
// backoff computes a duration after which an n+1 retry iteration should occur
|
||||
// using truncated exponential backoff algorithm.
|
||||
//
|
||||
// The n argument is always bounded between 0 and 30.
|
||||
// The max argument defines upper bound for the returned value.
|
||||
func backoff(n int, max time.Duration) time.Duration {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
if n > 30 {
|
||||
n = 30
|
||||
}
|
||||
var d time.Duration
|
||||
if x, err := rand.Int(rand.Reader, big.NewInt(1000)); err == nil {
|
||||
d = time.Duration(x.Int64()) * time.Millisecond
|
||||
}
|
||||
d += time.Duration(1<<uint(n)) * time.Second
|
||||
if d > max {
|
||||
return max
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// keyAuth generates a key authorization string for a given token.
|
||||
func keyAuth(pub crypto.PublicKey, token string) (string, error) {
|
||||
th, err := JWKThumbprint(pub)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return fmt.Sprintf("%s.%s", token, th), nil
|
||||
}
|
||||
|
||||
// tlsChallengeCert creates a temporary certificate for TLS-SNI challenges
|
||||
// with the given SANs and auto-generated public/private key pair.
|
||||
// To create a cert with a custom key pair, specify WithKey option.
|
||||
func tlsChallengeCert(san []string, opt []CertOption) (tls.Certificate, error) {
|
||||
var (
|
||||
key crypto.Signer
|
||||
tmpl *x509.Certificate
|
||||
)
|
||||
for _, o := range opt {
|
||||
switch o := o.(type) {
|
||||
case *certOptKey:
|
||||
if key != nil {
|
||||
return tls.Certificate{}, errors.New("acme: duplicate key option")
|
||||
}
|
||||
key = o.key
|
||||
case *certOptTemplate:
|
||||
var t = *(*x509.Certificate)(o) // shallow copy is ok
|
||||
tmpl = &t
|
||||
default:
|
||||
// package's fault, if we let this happen:
|
||||
panic(fmt.Sprintf("unsupported option type %T", o))
|
||||
}
|
||||
}
|
||||
if key == nil {
|
||||
var err error
|
||||
if key, err = ecdsa.GenerateKey(elliptic.P256(), rand.Reader); err != nil {
|
||||
return tls.Certificate{}, err
|
||||
}
|
||||
}
|
||||
if tmpl == nil {
|
||||
tmpl = &x509.Certificate{
|
||||
SerialNumber: big.NewInt(1),
|
||||
NotBefore: time.Now(),
|
||||
NotAfter: time.Now().Add(24 * time.Hour),
|
||||
BasicConstraintsValid: true,
|
||||
KeyUsage: x509.KeyUsageKeyEncipherment,
|
||||
}
|
||||
}
|
||||
tmpl.DNSNames = san
|
||||
|
||||
der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, key.Public(), key)
|
||||
if err != nil {
|
||||
return tls.Certificate{}, err
|
||||
}
|
||||
return tls.Certificate{
|
||||
Certificate: [][]byte{der},
|
||||
PrivateKey: key,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// encodePEM returns b encoded as PEM with block of type typ.
|
||||
func encodePEM(typ string, b []byte) []byte {
|
||||
pb := &pem.Block{Type: typ, Bytes: b}
|
||||
return pem.EncodeToMemory(pb)
|
||||
}
|
||||
|
||||
// timeNow is useful for testing for fixed current time.
|
||||
var timeNow = time.Now
|
||||
+793
@@ -0,0 +1,793 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package autocert provides automatic access to certificates from Let's Encrypt
|
||||
// and any other ACME-based CA.
|
||||
//
|
||||
// This package is a work in progress and makes no API stability promises.
|
||||
package autocert
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto"
|
||||
"crypto/ecdsa"
|
||||
"crypto/elliptic"
|
||||
"crypto/rand"
|
||||
"crypto/rsa"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/pem"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
mathrand "math/rand"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/crypto/acme"
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
||||
// pseudoRand is safe for concurrent use.
|
||||
var pseudoRand *lockedMathRand
|
||||
|
||||
func init() {
|
||||
src := mathrand.NewSource(timeNow().UnixNano())
|
||||
pseudoRand = &lockedMathRand{rnd: mathrand.New(src)}
|
||||
}
|
||||
|
||||
// AcceptTOS always returns true to indicate the acceptance of a CA Terms of Service
|
||||
// during account registration.
|
||||
func AcceptTOS(tosURL string) bool { return true }
|
||||
|
||||
// HostPolicy specifies which host names the Manager is allowed to respond to.
|
||||
// It returns a non-nil error if the host should be rejected.
|
||||
// The returned error is accessible via tls.Conn.Handshake and its callers.
|
||||
// See Manager's HostPolicy field and GetCertificate method docs for more details.
|
||||
type HostPolicy func(ctx context.Context, host string) error
|
||||
|
||||
// HostWhitelist returns a policy where only the specified host names are allowed.
|
||||
// Only exact matches are currently supported. Subdomains, regexp or wildcard
|
||||
// will not match.
|
||||
func HostWhitelist(hosts ...string) HostPolicy {
|
||||
whitelist := make(map[string]bool, len(hosts))
|
||||
for _, h := range hosts {
|
||||
whitelist[h] = true
|
||||
}
|
||||
return func(_ context.Context, host string) error {
|
||||
if !whitelist[host] {
|
||||
return errors.New("acme/autocert: host not configured")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// defaultHostPolicy is used when Manager.HostPolicy is not set.
|
||||
func defaultHostPolicy(context.Context, string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Manager is a stateful certificate manager built on top of acme.Client.
|
||||
// It obtains and refreshes certificates automatically,
|
||||
// as well as providing them to a TLS server via tls.Config.
|
||||
//
|
||||
// A simple usage example:
|
||||
//
|
||||
// m := autocert.Manager{
|
||||
// Prompt: autocert.AcceptTOS,
|
||||
// HostPolicy: autocert.HostWhitelist("example.org"),
|
||||
// }
|
||||
// s := &http.Server{
|
||||
// Addr: ":https",
|
||||
// TLSConfig: &tls.Config{GetCertificate: m.GetCertificate},
|
||||
// }
|
||||
// s.ListenAndServeTLS("", "")
|
||||
//
|
||||
// To preserve issued certificates and improve overall performance,
|
||||
// use a cache implementation of Cache. For instance, DirCache.
|
||||
type Manager struct {
|
||||
// Prompt specifies a callback function to conditionally accept a CA's Terms of Service (TOS).
|
||||
// The registration may require the caller to agree to the CA's TOS.
|
||||
// If so, Manager calls Prompt with a TOS URL provided by the CA. Prompt should report
|
||||
// whether the caller agrees to the terms.
|
||||
//
|
||||
// To always accept the terms, the callers can use AcceptTOS.
|
||||
Prompt func(tosURL string) bool
|
||||
|
||||
// Cache optionally stores and retrieves previously-obtained certificates.
|
||||
// If nil, certs will only be cached for the lifetime of the Manager.
|
||||
//
|
||||
// Manager passes the Cache certificates data encoded in PEM, with private/public
|
||||
// parts combined in a single Cache.Put call, private key first.
|
||||
Cache Cache
|
||||
|
||||
// HostPolicy controls which domains the Manager will attempt
|
||||
// to retrieve new certificates for. It does not affect cached certs.
|
||||
//
|
||||
// If non-nil, HostPolicy is called before requesting a new cert.
|
||||
// If nil, all hosts are currently allowed. This is not recommended,
|
||||
// as it opens a potential attack where clients connect to a server
|
||||
// by IP address and pretend to be asking for an incorrect host name.
|
||||
// Manager will attempt to obtain a certificate for that host, incorrectly,
|
||||
// eventually reaching the CA's rate limit for certificate requests
|
||||
// and making it impossible to obtain actual certificates.
|
||||
//
|
||||
// See GetCertificate for more details.
|
||||
HostPolicy HostPolicy
|
||||
|
||||
// RenewBefore optionally specifies how early certificates should
|
||||
// be renewed before they expire.
|
||||
//
|
||||
// If zero, they're renewed 1 week before expiration.
|
||||
RenewBefore time.Duration
|
||||
|
||||
// Client is used to perform low-level operations, such as account registration
|
||||
// and requesting new certificates.
|
||||
// If Client is nil, a zero-value acme.Client is used with acme.LetsEncryptURL
|
||||
// directory endpoint and a newly-generated ECDSA P-256 key.
|
||||
//
|
||||
// Mutating the field after the first call of GetCertificate method will have no effect.
|
||||
Client *acme.Client
|
||||
|
||||
// Email optionally specifies a contact email address.
|
||||
// This is used by CAs, such as Let's Encrypt, to notify about problems
|
||||
// with issued certificates.
|
||||
//
|
||||
// If the Client's account key is already registered, Email is not used.
|
||||
Email string
|
||||
|
||||
// ForceRSA makes the Manager generate certificates with 2048-bit RSA keys.
|
||||
//
|
||||
// If false, a default is used. Currently the default
|
||||
// is EC-based keys using the P-256 curve.
|
||||
ForceRSA bool
|
||||
|
||||
clientMu sync.Mutex
|
||||
client *acme.Client // initialized by acmeClient method
|
||||
|
||||
stateMu sync.Mutex
|
||||
state map[string]*certState // keyed by domain name
|
||||
|
||||
// tokenCert is keyed by token domain name, which matches server name
|
||||
// of ClientHello. Keys always have ".acme.invalid" suffix.
|
||||
tokenCertMu sync.RWMutex
|
||||
tokenCert map[string]*tls.Certificate
|
||||
|
||||
// renewal tracks the set of domains currently running renewal timers.
|
||||
// It is keyed by domain name.
|
||||
renewalMu sync.Mutex
|
||||
renewal map[string]*domainRenewal
|
||||
}
|
||||
|
||||
// GetCertificate implements the tls.Config.GetCertificate hook.
|
||||
// It provides a TLS certificate for hello.ServerName host, including answering
|
||||
// *.acme.invalid (TLS-SNI) challenges. All other fields of hello are ignored.
|
||||
//
|
||||
// If m.HostPolicy is non-nil, GetCertificate calls the policy before requesting
|
||||
// a new cert. A non-nil error returned from m.HostPolicy halts TLS negotiation.
|
||||
// The error is propagated back to the caller of GetCertificate and is user-visible.
|
||||
// This does not affect cached certs. See HostPolicy field description for more details.
|
||||
func (m *Manager) GetCertificate(hello *tls.ClientHelloInfo) (*tls.Certificate, error) {
|
||||
name := hello.ServerName
|
||||
if name == "" {
|
||||
return nil, errors.New("acme/autocert: missing server name")
|
||||
}
|
||||
|
||||
// check whether this is a token cert requested for TLS-SNI challenge
|
||||
if strings.HasSuffix(name, ".acme.invalid") {
|
||||
m.tokenCertMu.RLock()
|
||||
defer m.tokenCertMu.RUnlock()
|
||||
if cert := m.tokenCert[name]; cert != nil {
|
||||
return cert, nil
|
||||
}
|
||||
if cert, err := m.cacheGet(name); err == nil {
|
||||
return cert, nil
|
||||
}
|
||||
// TODO: cache error results?
|
||||
return nil, fmt.Errorf("acme/autocert: no token cert for %q", name)
|
||||
}
|
||||
|
||||
// regular domain
|
||||
name = strings.TrimSuffix(name, ".") // golang.org/issue/18114
|
||||
cert, err := m.cert(name)
|
||||
if err == nil {
|
||||
return cert, nil
|
||||
}
|
||||
if err != ErrCacheMiss {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// first-time
|
||||
ctx := context.Background() // TODO: use a deadline?
|
||||
if err := m.hostPolicy()(ctx, name); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cert, err = m.createCert(ctx, name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
m.cachePut(name, cert)
|
||||
return cert, nil
|
||||
}
|
||||
|
||||
// cert returns an existing certificate either from m.state or cache.
|
||||
// If a certificate is found in cache but not in m.state, the latter will be filled
|
||||
// with the cached value.
|
||||
func (m *Manager) cert(name string) (*tls.Certificate, error) {
|
||||
m.stateMu.Lock()
|
||||
if s, ok := m.state[name]; ok {
|
||||
m.stateMu.Unlock()
|
||||
s.RLock()
|
||||
defer s.RUnlock()
|
||||
return s.tlscert()
|
||||
}
|
||||
defer m.stateMu.Unlock()
|
||||
cert, err := m.cacheGet(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
signer, ok := cert.PrivateKey.(crypto.Signer)
|
||||
if !ok {
|
||||
return nil, errors.New("acme/autocert: private key cannot sign")
|
||||
}
|
||||
if m.state == nil {
|
||||
m.state = make(map[string]*certState)
|
||||
}
|
||||
s := &certState{
|
||||
key: signer,
|
||||
cert: cert.Certificate,
|
||||
leaf: cert.Leaf,
|
||||
}
|
||||
m.state[name] = s
|
||||
go m.renew(name, s.key, s.leaf.NotAfter)
|
||||
return cert, nil
|
||||
}
|
||||
|
||||
// cacheGet always returns a valid certificate, or an error otherwise.
|
||||
func (m *Manager) cacheGet(domain string) (*tls.Certificate, error) {
|
||||
if m.Cache == nil {
|
||||
return nil, ErrCacheMiss
|
||||
}
|
||||
// TODO: might want to define a cache timeout on m
|
||||
ctx := context.Background()
|
||||
data, err := m.Cache.Get(ctx, domain)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// private
|
||||
priv, pub := pem.Decode(data)
|
||||
if priv == nil || !strings.Contains(priv.Type, "PRIVATE") {
|
||||
return nil, errors.New("acme/autocert: no private key found in cache")
|
||||
}
|
||||
privKey, err := parsePrivateKey(priv.Bytes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// public
|
||||
var pubDER [][]byte
|
||||
for len(pub) > 0 {
|
||||
var b *pem.Block
|
||||
b, pub = pem.Decode(pub)
|
||||
if b == nil {
|
||||
break
|
||||
}
|
||||
pubDER = append(pubDER, b.Bytes)
|
||||
}
|
||||
if len(pub) > 0 {
|
||||
return nil, errors.New("acme/autocert: invalid public key")
|
||||
}
|
||||
|
||||
// verify and create TLS cert
|
||||
leaf, err := validCert(domain, pubDER, privKey)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tlscert := &tls.Certificate{
|
||||
Certificate: pubDER,
|
||||
PrivateKey: privKey,
|
||||
Leaf: leaf,
|
||||
}
|
||||
return tlscert, nil
|
||||
}
|
||||
|
||||
func (m *Manager) cachePut(domain string, tlscert *tls.Certificate) error {
|
||||
if m.Cache == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// contains PEM-encoded data
|
||||
var buf bytes.Buffer
|
||||
|
||||
// private
|
||||
switch key := tlscert.PrivateKey.(type) {
|
||||
case *ecdsa.PrivateKey:
|
||||
if err := encodeECDSAKey(&buf, key); err != nil {
|
||||
return err
|
||||
}
|
||||
case *rsa.PrivateKey:
|
||||
b := x509.MarshalPKCS1PrivateKey(key)
|
||||
pb := &pem.Block{Type: "RSA PRIVATE KEY", Bytes: b}
|
||||
if err := pem.Encode(&buf, pb); err != nil {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
return errors.New("acme/autocert: unknown private key type")
|
||||
}
|
||||
|
||||
// public
|
||||
for _, b := range tlscert.Certificate {
|
||||
pb := &pem.Block{Type: "CERTIFICATE", Bytes: b}
|
||||
if err := pem.Encode(&buf, pb); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: might want to define a cache timeout on m
|
||||
ctx := context.Background()
|
||||
return m.Cache.Put(ctx, domain, buf.Bytes())
|
||||
}
|
||||
|
||||
func encodeECDSAKey(w io.Writer, key *ecdsa.PrivateKey) error {
|
||||
b, err := x509.MarshalECPrivateKey(key)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
pb := &pem.Block{Type: "EC PRIVATE KEY", Bytes: b}
|
||||
return pem.Encode(w, pb)
|
||||
}
|
||||
|
||||
// createCert starts the domain ownership verification and returns a certificate
|
||||
// for that domain upon success.
|
||||
//
|
||||
// If the domain is already being verified, it waits for the existing verification to complete.
|
||||
// Either way, createCert blocks for the duration of the whole process.
|
||||
func (m *Manager) createCert(ctx context.Context, domain string) (*tls.Certificate, error) {
|
||||
// TODO: maybe rewrite this whole piece using sync.Once
|
||||
state, err := m.certState(domain)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// state may exist if another goroutine is already working on it
|
||||
// in which case just wait for it to finish
|
||||
if !state.locked {
|
||||
state.RLock()
|
||||
defer state.RUnlock()
|
||||
return state.tlscert()
|
||||
}
|
||||
|
||||
// We are the first; state is locked.
|
||||
// Unblock the readers when domain ownership is verified
|
||||
// and the we got the cert or the process failed.
|
||||
defer state.Unlock()
|
||||
state.locked = false
|
||||
|
||||
der, leaf, err := m.authorizedCert(ctx, state.key, domain)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
state.cert = der
|
||||
state.leaf = leaf
|
||||
go m.renew(domain, state.key, state.leaf.NotAfter)
|
||||
return state.tlscert()
|
||||
}
|
||||
|
||||
// certState returns a new or existing certState.
|
||||
// If a new certState is returned, state.exist is false and the state is locked.
|
||||
// The returned error is non-nil only in the case where a new state could not be created.
|
||||
func (m *Manager) certState(domain string) (*certState, error) {
|
||||
m.stateMu.Lock()
|
||||
defer m.stateMu.Unlock()
|
||||
if m.state == nil {
|
||||
m.state = make(map[string]*certState)
|
||||
}
|
||||
// existing state
|
||||
if state, ok := m.state[domain]; ok {
|
||||
return state, nil
|
||||
}
|
||||
|
||||
// new locked state
|
||||
var (
|
||||
err error
|
||||
key crypto.Signer
|
||||
)
|
||||
if m.ForceRSA {
|
||||
key, err = rsa.GenerateKey(rand.Reader, 2048)
|
||||
} else {
|
||||
key, err = ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
state := &certState{
|
||||
key: key,
|
||||
locked: true,
|
||||
}
|
||||
state.Lock() // will be unlocked by m.certState caller
|
||||
m.state[domain] = state
|
||||
return state, nil
|
||||
}
|
||||
|
||||
// authorizedCert starts domain ownership verification process and requests a new cert upon success.
|
||||
// The key argument is the certificate private key.
|
||||
func (m *Manager) authorizedCert(ctx context.Context, key crypto.Signer, domain string) (der [][]byte, leaf *x509.Certificate, err error) {
|
||||
// TODO: make m.verify retry or retry m.verify calls here
|
||||
if err := m.verify(ctx, domain); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
client, err := m.acmeClient(ctx)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
csr, err := certRequest(key, domain)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
der, _, err = client.CreateCert(ctx, csr, 0, true)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
leaf, err = validCert(domain, der, key)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return der, leaf, nil
|
||||
}
|
||||
|
||||
// verify starts a new identifier (domain) authorization flow.
|
||||
// It prepares a challenge response and then blocks until the authorization
|
||||
// is marked as "completed" by the CA (either succeeded or failed).
|
||||
//
|
||||
// verify returns nil iff the verification was successful.
|
||||
func (m *Manager) verify(ctx context.Context, domain string) error {
|
||||
client, err := m.acmeClient(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// start domain authorization and get the challenge
|
||||
authz, err := client.Authorize(ctx, domain)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// maybe don't need to at all
|
||||
if authz.Status == acme.StatusValid {
|
||||
return nil
|
||||
}
|
||||
|
||||
// pick a challenge: prefer tls-sni-02 over tls-sni-01
|
||||
// TODO: consider authz.Combinations
|
||||
var chal *acme.Challenge
|
||||
for _, c := range authz.Challenges {
|
||||
if c.Type == "tls-sni-02" {
|
||||
chal = c
|
||||
break
|
||||
}
|
||||
if c.Type == "tls-sni-01" {
|
||||
chal = c
|
||||
}
|
||||
}
|
||||
if chal == nil {
|
||||
return errors.New("acme/autocert: no supported challenge type found")
|
||||
}
|
||||
|
||||
// create a token cert for the challenge response
|
||||
var (
|
||||
cert tls.Certificate
|
||||
name string
|
||||
)
|
||||
switch chal.Type {
|
||||
case "tls-sni-01":
|
||||
cert, name, err = client.TLSSNI01ChallengeCert(chal.Token)
|
||||
case "tls-sni-02":
|
||||
cert, name, err = client.TLSSNI02ChallengeCert(chal.Token)
|
||||
default:
|
||||
err = fmt.Errorf("acme/autocert: unknown challenge type %q", chal.Type)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
m.putTokenCert(name, &cert)
|
||||
defer func() {
|
||||
// verification has ended at this point
|
||||
// don't need token cert anymore
|
||||
go m.deleteTokenCert(name)
|
||||
}()
|
||||
|
||||
// ready to fulfill the challenge
|
||||
if _, err := client.Accept(ctx, chal); err != nil {
|
||||
return err
|
||||
}
|
||||
// wait for the CA to validate
|
||||
_, err = client.WaitAuthorization(ctx, authz.URI)
|
||||
return err
|
||||
}
|
||||
|
||||
// putTokenCert stores the cert under the named key in both m.tokenCert map
|
||||
// and m.Cache.
|
||||
func (m *Manager) putTokenCert(name string, cert *tls.Certificate) {
|
||||
m.tokenCertMu.Lock()
|
||||
defer m.tokenCertMu.Unlock()
|
||||
if m.tokenCert == nil {
|
||||
m.tokenCert = make(map[string]*tls.Certificate)
|
||||
}
|
||||
m.tokenCert[name] = cert
|
||||
m.cachePut(name, cert)
|
||||
}
|
||||
|
||||
// deleteTokenCert removes the token certificate for the specified domain name
|
||||
// from both m.tokenCert map and m.Cache.
|
||||
func (m *Manager) deleteTokenCert(name string) {
|
||||
m.tokenCertMu.Lock()
|
||||
defer m.tokenCertMu.Unlock()
|
||||
delete(m.tokenCert, name)
|
||||
if m.Cache != nil {
|
||||
m.Cache.Delete(context.Background(), name)
|
||||
}
|
||||
}
|
||||
|
||||
// renew starts a cert renewal timer loop, one per domain.
|
||||
//
|
||||
// The loop is scheduled in two cases:
|
||||
// - a cert was fetched from cache for the first time (wasn't in m.state)
|
||||
// - a new cert was created by m.createCert
|
||||
//
|
||||
// The key argument is a certificate private key.
|
||||
// The exp argument is the cert expiration time (NotAfter).
|
||||
func (m *Manager) renew(domain string, key crypto.Signer, exp time.Time) {
|
||||
m.renewalMu.Lock()
|
||||
defer m.renewalMu.Unlock()
|
||||
if m.renewal[domain] != nil {
|
||||
// another goroutine is already on it
|
||||
return
|
||||
}
|
||||
if m.renewal == nil {
|
||||
m.renewal = make(map[string]*domainRenewal)
|
||||
}
|
||||
dr := &domainRenewal{m: m, domain: domain, key: key}
|
||||
m.renewal[domain] = dr
|
||||
dr.start(exp)
|
||||
}
|
||||
|
||||
// stopRenew stops all currently running cert renewal timers.
|
||||
// The timers are not restarted during the lifetime of the Manager.
|
||||
func (m *Manager) stopRenew() {
|
||||
m.renewalMu.Lock()
|
||||
defer m.renewalMu.Unlock()
|
||||
for name, dr := range m.renewal {
|
||||
delete(m.renewal, name)
|
||||
dr.stop()
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Manager) accountKey(ctx context.Context) (crypto.Signer, error) {
|
||||
const keyName = "acme_account.key"
|
||||
|
||||
genKey := func() (*ecdsa.PrivateKey, error) {
|
||||
return ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
|
||||
}
|
||||
|
||||
if m.Cache == nil {
|
||||
return genKey()
|
||||
}
|
||||
|
||||
data, err := m.Cache.Get(ctx, keyName)
|
||||
if err == ErrCacheMiss {
|
||||
key, err := genKey()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var buf bytes.Buffer
|
||||
if err := encodeECDSAKey(&buf, key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := m.Cache.Put(ctx, keyName, buf.Bytes()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return key, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
priv, _ := pem.Decode(data)
|
||||
if priv == nil || !strings.Contains(priv.Type, "PRIVATE") {
|
||||
return nil, errors.New("acme/autocert: invalid account key found in cache")
|
||||
}
|
||||
return parsePrivateKey(priv.Bytes)
|
||||
}
|
||||
|
||||
func (m *Manager) acmeClient(ctx context.Context) (*acme.Client, error) {
|
||||
m.clientMu.Lock()
|
||||
defer m.clientMu.Unlock()
|
||||
if m.client != nil {
|
||||
return m.client, nil
|
||||
}
|
||||
|
||||
client := m.Client
|
||||
if client == nil {
|
||||
client = &acme.Client{DirectoryURL: acme.LetsEncryptURL}
|
||||
}
|
||||
if client.Key == nil {
|
||||
var err error
|
||||
client.Key, err = m.accountKey(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
var contact []string
|
||||
if m.Email != "" {
|
||||
contact = []string{"mailto:" + m.Email}
|
||||
}
|
||||
a := &acme.Account{Contact: contact}
|
||||
_, err := client.Register(ctx, a, m.Prompt)
|
||||
if ae, ok := err.(*acme.Error); err == nil || ok && ae.StatusCode == http.StatusConflict {
|
||||
// conflict indicates the key is already registered
|
||||
m.client = client
|
||||
err = nil
|
||||
}
|
||||
return m.client, err
|
||||
}
|
||||
|
||||
func (m *Manager) hostPolicy() HostPolicy {
|
||||
if m.HostPolicy != nil {
|
||||
return m.HostPolicy
|
||||
}
|
||||
return defaultHostPolicy
|
||||
}
|
||||
|
||||
func (m *Manager) renewBefore() time.Duration {
|
||||
if m.RenewBefore > maxRandRenew {
|
||||
return m.RenewBefore
|
||||
}
|
||||
return 7 * 24 * time.Hour // 1 week
|
||||
}
|
||||
|
||||
// certState is ready when its mutex is unlocked for reading.
|
||||
type certState struct {
|
||||
sync.RWMutex
|
||||
locked bool // locked for read/write
|
||||
key crypto.Signer // private key for cert
|
||||
cert [][]byte // DER encoding
|
||||
leaf *x509.Certificate // parsed cert[0]; always non-nil if cert != nil
|
||||
}
|
||||
|
||||
// tlscert creates a tls.Certificate from s.key and s.cert.
|
||||
// Callers should wrap it in s.RLock() and s.RUnlock().
|
||||
func (s *certState) tlscert() (*tls.Certificate, error) {
|
||||
if s.key == nil {
|
||||
return nil, errors.New("acme/autocert: missing signer")
|
||||
}
|
||||
if len(s.cert) == 0 {
|
||||
return nil, errors.New("acme/autocert: missing certificate")
|
||||
}
|
||||
return &tls.Certificate{
|
||||
PrivateKey: s.key,
|
||||
Certificate: s.cert,
|
||||
Leaf: s.leaf,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// certRequest creates a certificate request for the given common name cn
|
||||
// and optional SANs.
|
||||
func certRequest(key crypto.Signer, cn string, san ...string) ([]byte, error) {
|
||||
req := &x509.CertificateRequest{
|
||||
Subject: pkix.Name{CommonName: cn},
|
||||
DNSNames: san,
|
||||
}
|
||||
return x509.CreateCertificateRequest(rand.Reader, req, key)
|
||||
}
|
||||
|
||||
// Attempt to parse the given private key DER block. OpenSSL 0.9.8 generates
|
||||
// PKCS#1 private keys by default, while OpenSSL 1.0.0 generates PKCS#8 keys.
|
||||
// OpenSSL ecparam generates SEC1 EC private keys for ECDSA. We try all three.
|
||||
//
|
||||
// Inspired by parsePrivateKey in crypto/tls/tls.go.
|
||||
func parsePrivateKey(der []byte) (crypto.Signer, error) {
|
||||
if key, err := x509.ParsePKCS1PrivateKey(der); err == nil {
|
||||
return key, nil
|
||||
}
|
||||
if key, err := x509.ParsePKCS8PrivateKey(der); err == nil {
|
||||
switch key := key.(type) {
|
||||
case *rsa.PrivateKey:
|
||||
return key, nil
|
||||
case *ecdsa.PrivateKey:
|
||||
return key, nil
|
||||
default:
|
||||
return nil, errors.New("acme/autocert: unknown private key type in PKCS#8 wrapping")
|
||||
}
|
||||
}
|
||||
if key, err := x509.ParseECPrivateKey(der); err == nil {
|
||||
return key, nil
|
||||
}
|
||||
|
||||
return nil, errors.New("acme/autocert: failed to parse private key")
|
||||
}
|
||||
|
||||
// validCert parses a cert chain provided as der argument and verifies the leaf, der[0],
|
||||
// corresponds to the private key, as well as the domain match and expiration dates.
|
||||
// It doesn't do any revocation checking.
|
||||
//
|
||||
// The returned value is the verified leaf cert.
|
||||
func validCert(domain string, der [][]byte, key crypto.Signer) (leaf *x509.Certificate, err error) {
|
||||
// parse public part(s)
|
||||
var n int
|
||||
for _, b := range der {
|
||||
n += len(b)
|
||||
}
|
||||
pub := make([]byte, n)
|
||||
n = 0
|
||||
for _, b := range der {
|
||||
n += copy(pub[n:], b)
|
||||
}
|
||||
x509Cert, err := x509.ParseCertificates(pub)
|
||||
if len(x509Cert) == 0 {
|
||||
return nil, errors.New("acme/autocert: no public key found")
|
||||
}
|
||||
// verify the leaf is not expired and matches the domain name
|
||||
leaf = x509Cert[0]
|
||||
now := timeNow()
|
||||
if now.Before(leaf.NotBefore) {
|
||||
return nil, errors.New("acme/autocert: certificate is not valid yet")
|
||||
}
|
||||
if now.After(leaf.NotAfter) {
|
||||
return nil, errors.New("acme/autocert: expired certificate")
|
||||
}
|
||||
if err := leaf.VerifyHostname(domain); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// ensure the leaf corresponds to the private key
|
||||
switch pub := leaf.PublicKey.(type) {
|
||||
case *rsa.PublicKey:
|
||||
prv, ok := key.(*rsa.PrivateKey)
|
||||
if !ok {
|
||||
return nil, errors.New("acme/autocert: private key type does not match public key type")
|
||||
}
|
||||
if pub.N.Cmp(prv.N) != 0 {
|
||||
return nil, errors.New("acme/autocert: private key does not match public key")
|
||||
}
|
||||
case *ecdsa.PublicKey:
|
||||
prv, ok := key.(*ecdsa.PrivateKey)
|
||||
if !ok {
|
||||
return nil, errors.New("acme/autocert: private key type does not match public key type")
|
||||
}
|
||||
if pub.X.Cmp(prv.X) != 0 || pub.Y.Cmp(prv.Y) != 0 {
|
||||
return nil, errors.New("acme/autocert: private key does not match public key")
|
||||
}
|
||||
default:
|
||||
return nil, errors.New("acme/autocert: unknown public key algorithm")
|
||||
}
|
||||
return leaf, nil
|
||||
}
|
||||
|
||||
func retryAfter(v string) time.Duration {
|
||||
if i, err := strconv.Atoi(v); err == nil {
|
||||
return time.Duration(i) * time.Second
|
||||
}
|
||||
if t, err := http.ParseTime(v); err == nil {
|
||||
return t.Sub(timeNow())
|
||||
}
|
||||
return time.Second
|
||||
}
|
||||
|
||||
type lockedMathRand struct {
|
||||
sync.Mutex
|
||||
rnd *mathrand.Rand
|
||||
}
|
||||
|
||||
func (r *lockedMathRand) int63n(max int64) int64 {
|
||||
r.Lock()
|
||||
n := r.rnd.Int63n(max)
|
||||
r.Unlock()
|
||||
return n
|
||||
}
|
||||
|
||||
// for easier testing
|
||||
var timeNow = time.Now
|
||||
+130
@@ -0,0 +1,130 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package autocert
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
||||
// ErrCacheMiss is returned when a certificate is not found in cache.
|
||||
var ErrCacheMiss = errors.New("acme/autocert: certificate cache miss")
|
||||
|
||||
// Cache is used by Manager to store and retrieve previously obtained certificates
|
||||
// as opaque data.
|
||||
//
|
||||
// The key argument of the methods refers to a domain name but need not be an FQDN.
|
||||
// Cache implementations should not rely on the key naming pattern.
|
||||
type Cache interface {
|
||||
// Get returns a certificate data for the specified key.
|
||||
// If there's no such key, Get returns ErrCacheMiss.
|
||||
Get(ctx context.Context, key string) ([]byte, error)
|
||||
|
||||
// Put stores the data in the cache under the specified key.
|
||||
// Underlying implementations may use any data storage format,
|
||||
// as long as the reverse operation, Get, results in the original data.
|
||||
Put(ctx context.Context, key string, data []byte) error
|
||||
|
||||
// Delete removes a certificate data from the cache under the specified key.
|
||||
// If there's no such key in the cache, Delete returns nil.
|
||||
Delete(ctx context.Context, key string) error
|
||||
}
|
||||
|
||||
// DirCache implements Cache using a directory on the local filesystem.
|
||||
// If the directory does not exist, it will be created with 0700 permissions.
|
||||
type DirCache string
|
||||
|
||||
// Get reads a certificate data from the specified file name.
|
||||
func (d DirCache) Get(ctx context.Context, name string) ([]byte, error) {
|
||||
name = filepath.Join(string(d), name)
|
||||
var (
|
||||
data []byte
|
||||
err error
|
||||
done = make(chan struct{})
|
||||
)
|
||||
go func() {
|
||||
data, err = ioutil.ReadFile(name)
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
case <-done:
|
||||
}
|
||||
if os.IsNotExist(err) {
|
||||
return nil, ErrCacheMiss
|
||||
}
|
||||
return data, err
|
||||
}
|
||||
|
||||
// Put writes the certificate data to the specified file name.
|
||||
// The file will be created with 0600 permissions.
|
||||
func (d DirCache) Put(ctx context.Context, name string, data []byte) error {
|
||||
if err := os.MkdirAll(string(d), 0700); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
done := make(chan struct{})
|
||||
var err error
|
||||
go func() {
|
||||
defer close(done)
|
||||
var tmp string
|
||||
if tmp, err = d.writeTempFile(name, data); err != nil {
|
||||
return
|
||||
}
|
||||
// prevent overwriting the file if the context was cancelled
|
||||
if ctx.Err() != nil {
|
||||
return // no need to set err
|
||||
}
|
||||
name = filepath.Join(string(d), name)
|
||||
err = os.Rename(tmp, name)
|
||||
}()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-done:
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// Delete removes the specified file name.
|
||||
func (d DirCache) Delete(ctx context.Context, name string) error {
|
||||
name = filepath.Join(string(d), name)
|
||||
var (
|
||||
err error
|
||||
done = make(chan struct{})
|
||||
)
|
||||
go func() {
|
||||
err = os.Remove(name)
|
||||
close(done)
|
||||
}()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-done:
|
||||
}
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// writeTempFile writes b to a temporary file, closes the file and returns its path.
|
||||
func (d DirCache) writeTempFile(prefix string, b []byte) (string, error) {
|
||||
// TempFile uses 0600 permissions
|
||||
f, err := ioutil.TempFile(string(d), prefix)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if _, err := f.Write(b); err != nil {
|
||||
f.Close()
|
||||
return "", err
|
||||
}
|
||||
return f.Name(), f.Close()
|
||||
}
|
||||
+125
@@ -0,0 +1,125 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package autocert
|
||||
|
||||
import (
|
||||
"crypto"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
||||
// maxRandRenew is a maximum deviation from Manager.RenewBefore.
|
||||
const maxRandRenew = time.Hour
|
||||
|
||||
// domainRenewal tracks the state used by the periodic timers
|
||||
// renewing a single domain's cert.
|
||||
type domainRenewal struct {
|
||||
m *Manager
|
||||
domain string
|
||||
key crypto.Signer
|
||||
|
||||
timerMu sync.Mutex
|
||||
timer *time.Timer
|
||||
}
|
||||
|
||||
// start starts a cert renewal timer at the time
|
||||
// defined by the certificate expiration time exp.
|
||||
//
|
||||
// If the timer is already started, calling start is a noop.
|
||||
func (dr *domainRenewal) start(exp time.Time) {
|
||||
dr.timerMu.Lock()
|
||||
defer dr.timerMu.Unlock()
|
||||
if dr.timer != nil {
|
||||
return
|
||||
}
|
||||
dr.timer = time.AfterFunc(dr.next(exp), dr.renew)
|
||||
}
|
||||
|
||||
// stop stops the cert renewal timer.
|
||||
// If the timer is already stopped, calling stop is a noop.
|
||||
func (dr *domainRenewal) stop() {
|
||||
dr.timerMu.Lock()
|
||||
defer dr.timerMu.Unlock()
|
||||
if dr.timer == nil {
|
||||
return
|
||||
}
|
||||
dr.timer.Stop()
|
||||
dr.timer = nil
|
||||
}
|
||||
|
||||
// renew is called periodically by a timer.
|
||||
// The first renew call is kicked off by dr.start.
|
||||
func (dr *domainRenewal) renew() {
|
||||
dr.timerMu.Lock()
|
||||
defer dr.timerMu.Unlock()
|
||||
if dr.timer == nil {
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
|
||||
defer cancel()
|
||||
// TODO: rotate dr.key at some point?
|
||||
next, err := dr.do(ctx)
|
||||
if err != nil {
|
||||
next = maxRandRenew / 2
|
||||
next += time.Duration(pseudoRand.int63n(int64(next)))
|
||||
}
|
||||
dr.timer = time.AfterFunc(next, dr.renew)
|
||||
testDidRenewLoop(next, err)
|
||||
}
|
||||
|
||||
// do is similar to Manager.createCert but it doesn't lock a Manager.state item.
|
||||
// Instead, it requests a new certificate independently and, upon success,
|
||||
// replaces dr.m.state item with a new one and updates cache for the given domain.
|
||||
//
|
||||
// It may return immediately if the expiration date of the currently cached cert
|
||||
// is far enough in the future.
|
||||
//
|
||||
// The returned value is a time interval after which the renewal should occur again.
|
||||
func (dr *domainRenewal) do(ctx context.Context) (time.Duration, error) {
|
||||
// a race is likely unavoidable in a distributed environment
|
||||
// but we try nonetheless
|
||||
if tlscert, err := dr.m.cacheGet(dr.domain); err == nil {
|
||||
next := dr.next(tlscert.Leaf.NotAfter)
|
||||
if next > dr.m.renewBefore()+maxRandRenew {
|
||||
return next, nil
|
||||
}
|
||||
}
|
||||
|
||||
der, leaf, err := dr.m.authorizedCert(ctx, dr.key, dr.domain)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
state := &certState{
|
||||
key: dr.key,
|
||||
cert: der,
|
||||
leaf: leaf,
|
||||
}
|
||||
tlscert, err := state.tlscert()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
dr.m.cachePut(dr.domain, tlscert)
|
||||
dr.m.stateMu.Lock()
|
||||
defer dr.m.stateMu.Unlock()
|
||||
// m.state is guaranteed to be non-nil at this point
|
||||
dr.m.state[dr.domain] = state
|
||||
return dr.next(leaf.NotAfter), nil
|
||||
}
|
||||
|
||||
func (dr *domainRenewal) next(expiry time.Time) time.Duration {
|
||||
d := expiry.Sub(timeNow()) - dr.m.renewBefore()
|
||||
// add a bit of randomness to renew deadline
|
||||
n := pseudoRand.int63n(int64(maxRandRenew))
|
||||
d -= time.Duration(n)
|
||||
if d < 0 {
|
||||
return 0
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
var testDidRenewLoop = func(next time.Duration, err error) {}
|
||||
+153
@@ -0,0 +1,153 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package acme
|
||||
|
||||
import (
|
||||
"crypto"
|
||||
"crypto/ecdsa"
|
||||
"crypto/rand"
|
||||
"crypto/rsa"
|
||||
"crypto/sha256"
|
||||
_ "crypto/sha512" // need for EC keys
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/big"
|
||||
)
|
||||
|
||||
// jwsEncodeJSON signs claimset using provided key and a nonce.
|
||||
// The result is serialized in JSON format.
|
||||
// See https://tools.ietf.org/html/rfc7515#section-7.
|
||||
func jwsEncodeJSON(claimset interface{}, key crypto.Signer, nonce string) ([]byte, error) {
|
||||
jwk, err := jwkEncode(key.Public())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
alg, sha := jwsHasher(key)
|
||||
if alg == "" || !sha.Available() {
|
||||
return nil, ErrUnsupportedKey
|
||||
}
|
||||
phead := fmt.Sprintf(`{"alg":%q,"jwk":%s,"nonce":%q}`, alg, jwk, nonce)
|
||||
phead = base64.RawURLEncoding.EncodeToString([]byte(phead))
|
||||
cs, err := json.Marshal(claimset)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
payload := base64.RawURLEncoding.EncodeToString(cs)
|
||||
hash := sha.New()
|
||||
hash.Write([]byte(phead + "." + payload))
|
||||
sig, err := jwsSign(key, sha, hash.Sum(nil))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
enc := struct {
|
||||
Protected string `json:"protected"`
|
||||
Payload string `json:"payload"`
|
||||
Sig string `json:"signature"`
|
||||
}{
|
||||
Protected: phead,
|
||||
Payload: payload,
|
||||
Sig: base64.RawURLEncoding.EncodeToString(sig),
|
||||
}
|
||||
return json.Marshal(&enc)
|
||||
}
|
||||
|
||||
// jwkEncode encodes public part of an RSA or ECDSA key into a JWK.
|
||||
// The result is also suitable for creating a JWK thumbprint.
|
||||
// https://tools.ietf.org/html/rfc7517
|
||||
func jwkEncode(pub crypto.PublicKey) (string, error) {
|
||||
switch pub := pub.(type) {
|
||||
case *rsa.PublicKey:
|
||||
// https://tools.ietf.org/html/rfc7518#section-6.3.1
|
||||
n := pub.N
|
||||
e := big.NewInt(int64(pub.E))
|
||||
// Field order is important.
|
||||
// See https://tools.ietf.org/html/rfc7638#section-3.3 for details.
|
||||
return fmt.Sprintf(`{"e":"%s","kty":"RSA","n":"%s"}`,
|
||||
base64.RawURLEncoding.EncodeToString(e.Bytes()),
|
||||
base64.RawURLEncoding.EncodeToString(n.Bytes()),
|
||||
), nil
|
||||
case *ecdsa.PublicKey:
|
||||
// https://tools.ietf.org/html/rfc7518#section-6.2.1
|
||||
p := pub.Curve.Params()
|
||||
n := p.BitSize / 8
|
||||
if p.BitSize%8 != 0 {
|
||||
n++
|
||||
}
|
||||
x := pub.X.Bytes()
|
||||
if n > len(x) {
|
||||
x = append(make([]byte, n-len(x)), x...)
|
||||
}
|
||||
y := pub.Y.Bytes()
|
||||
if n > len(y) {
|
||||
y = append(make([]byte, n-len(y)), y...)
|
||||
}
|
||||
// Field order is important.
|
||||
// See https://tools.ietf.org/html/rfc7638#section-3.3 for details.
|
||||
return fmt.Sprintf(`{"crv":"%s","kty":"EC","x":"%s","y":"%s"}`,
|
||||
p.Name,
|
||||
base64.RawURLEncoding.EncodeToString(x),
|
||||
base64.RawURLEncoding.EncodeToString(y),
|
||||
), nil
|
||||
}
|
||||
return "", ErrUnsupportedKey
|
||||
}
|
||||
|
||||
// jwsSign signs the digest using the given key.
|
||||
// It returns ErrUnsupportedKey if the key type is unknown.
|
||||
// The hash is used only for RSA keys.
|
||||
func jwsSign(key crypto.Signer, hash crypto.Hash, digest []byte) ([]byte, error) {
|
||||
switch key := key.(type) {
|
||||
case *rsa.PrivateKey:
|
||||
return key.Sign(rand.Reader, digest, hash)
|
||||
case *ecdsa.PrivateKey:
|
||||
r, s, err := ecdsa.Sign(rand.Reader, key, digest)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rb, sb := r.Bytes(), s.Bytes()
|
||||
size := key.Params().BitSize / 8
|
||||
if size%8 > 0 {
|
||||
size++
|
||||
}
|
||||
sig := make([]byte, size*2)
|
||||
copy(sig[size-len(rb):], rb)
|
||||
copy(sig[size*2-len(sb):], sb)
|
||||
return sig, nil
|
||||
}
|
||||
return nil, ErrUnsupportedKey
|
||||
}
|
||||
|
||||
// jwsHasher indicates suitable JWS algorithm name and a hash function
|
||||
// to use for signing a digest with the provided key.
|
||||
// It returns ("", 0) if the key is not supported.
|
||||
func jwsHasher(key crypto.Signer) (string, crypto.Hash) {
|
||||
switch key := key.(type) {
|
||||
case *rsa.PrivateKey:
|
||||
return "RS256", crypto.SHA256
|
||||
case *ecdsa.PrivateKey:
|
||||
switch key.Params().Name {
|
||||
case "P-256":
|
||||
return "ES256", crypto.SHA256
|
||||
case "P-384":
|
||||
return "ES384", crypto.SHA384
|
||||
case "P-512":
|
||||
return "ES512", crypto.SHA512
|
||||
}
|
||||
}
|
||||
return "", 0
|
||||
}
|
||||
|
||||
// JWKThumbprint creates a JWK thumbprint out of pub
|
||||
// as specified in https://tools.ietf.org/html/rfc7638.
|
||||
func JWKThumbprint(pub crypto.PublicKey) (string, error) {
|
||||
jwk, err := jwkEncode(pub)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
b := sha256.Sum256([]byte(jwk))
|
||||
return base64.RawURLEncoding.EncodeToString(b[:]), nil
|
||||
}
|
||||
+209
@@ -0,0 +1,209 @@
|
||||
package acme
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// ACME server response statuses used to describe Authorization and Challenge states.
|
||||
const (
|
||||
StatusUnknown = "unknown"
|
||||
StatusPending = "pending"
|
||||
StatusProcessing = "processing"
|
||||
StatusValid = "valid"
|
||||
StatusInvalid = "invalid"
|
||||
StatusRevoked = "revoked"
|
||||
)
|
||||
|
||||
// CRLReasonCode identifies the reason for a certificate revocation.
|
||||
type CRLReasonCode int
|
||||
|
||||
// CRL reason codes as defined in RFC 5280.
|
||||
const (
|
||||
CRLReasonUnspecified CRLReasonCode = 0
|
||||
CRLReasonKeyCompromise CRLReasonCode = 1
|
||||
CRLReasonCACompromise CRLReasonCode = 2
|
||||
CRLReasonAffiliationChanged CRLReasonCode = 3
|
||||
CRLReasonSuperseded CRLReasonCode = 4
|
||||
CRLReasonCessationOfOperation CRLReasonCode = 5
|
||||
CRLReasonCertificateHold CRLReasonCode = 6
|
||||
CRLReasonRemoveFromCRL CRLReasonCode = 8
|
||||
CRLReasonPrivilegeWithdrawn CRLReasonCode = 9
|
||||
CRLReasonAACompromise CRLReasonCode = 10
|
||||
)
|
||||
|
||||
var (
|
||||
// ErrAuthorizationFailed indicates that an authorization for an identifier
|
||||
// did not succeed.
|
||||
ErrAuthorizationFailed = errors.New("acme: identifier authorization failed")
|
||||
|
||||
// ErrUnsupportedKey is returned when an unsupported key type is encountered.
|
||||
ErrUnsupportedKey = errors.New("acme: unknown key type; only RSA and ECDSA are supported")
|
||||
)
|
||||
|
||||
// Error is an ACME error, defined in Problem Details for HTTP APIs doc
|
||||
// http://tools.ietf.org/html/draft-ietf-appsawg-http-problem.
|
||||
type Error struct {
|
||||
// StatusCode is The HTTP status code generated by the origin server.
|
||||
StatusCode int
|
||||
// ProblemType is a URI reference that identifies the problem type,
|
||||
// typically in a "urn:acme:error:xxx" form.
|
||||
ProblemType string
|
||||
// Detail is a human-readable explanation specific to this occurrence of the problem.
|
||||
Detail string
|
||||
// Header is the original server error response headers.
|
||||
Header http.Header
|
||||
}
|
||||
|
||||
func (e *Error) Error() string {
|
||||
return fmt.Sprintf("%d %s: %s", e.StatusCode, e.ProblemType, e.Detail)
|
||||
}
|
||||
|
||||
// Account is a user account. It is associated with a private key.
|
||||
type Account struct {
|
||||
// URI is the account unique ID, which is also a URL used to retrieve
|
||||
// account data from the CA.
|
||||
URI string
|
||||
|
||||
// Contact is a slice of contact info used during registration.
|
||||
Contact []string
|
||||
|
||||
// The terms user has agreed to.
|
||||
// A value not matching CurrentTerms indicates that the user hasn't agreed
|
||||
// to the actual Terms of Service of the CA.
|
||||
AgreedTerms string
|
||||
|
||||
// Actual terms of a CA.
|
||||
CurrentTerms string
|
||||
|
||||
// Authz is the authorization URL used to initiate a new authz flow.
|
||||
Authz string
|
||||
|
||||
// Authorizations is a URI from which a list of authorizations
|
||||
// granted to this account can be fetched via a GET request.
|
||||
Authorizations string
|
||||
|
||||
// Certificates is a URI from which a list of certificates
|
||||
// issued for this account can be fetched via a GET request.
|
||||
Certificates string
|
||||
}
|
||||
|
||||
// Directory is ACME server discovery data.
|
||||
type Directory struct {
|
||||
// RegURL is an account endpoint URL, allowing for creating new
|
||||
// and modifying existing accounts.
|
||||
RegURL string
|
||||
|
||||
// AuthzURL is used to initiate Identifier Authorization flow.
|
||||
AuthzURL string
|
||||
|
||||
// CertURL is a new certificate issuance endpoint URL.
|
||||
CertURL string
|
||||
|
||||
// RevokeURL is used to initiate a certificate revocation flow.
|
||||
RevokeURL string
|
||||
|
||||
// Term is a URI identifying the current terms of service.
|
||||
Terms string
|
||||
|
||||
// Website is an HTTP or HTTPS URL locating a website
|
||||
// providing more information about the ACME server.
|
||||
Website string
|
||||
|
||||
// CAA consists of lowercase hostname elements, which the ACME server
|
||||
// recognises as referring to itself for the purposes of CAA record validation
|
||||
// as defined in RFC6844.
|
||||
CAA []string
|
||||
}
|
||||
|
||||
// Challenge encodes a returned CA challenge.
|
||||
type Challenge struct {
|
||||
// Type is the challenge type, e.g. "http-01", "tls-sni-02", "dns-01".
|
||||
Type string
|
||||
|
||||
// URI is where a challenge response can be posted to.
|
||||
URI string
|
||||
|
||||
// Token is a random value that uniquely identifies the challenge.
|
||||
Token string
|
||||
|
||||
// Status identifies the status of this challenge.
|
||||
Status string
|
||||
}
|
||||
|
||||
// Authorization encodes an authorization response.
|
||||
type Authorization struct {
|
||||
// URI uniquely identifies a authorization.
|
||||
URI string
|
||||
|
||||
// Status identifies the status of an authorization.
|
||||
Status string
|
||||
|
||||
// Identifier is what the account is authorized to represent.
|
||||
Identifier AuthzID
|
||||
|
||||
// Challenges that the client needs to fulfill in order to prove possession
|
||||
// of the identifier (for pending authorizations).
|
||||
// For final authorizations, the challenges that were used.
|
||||
Challenges []*Challenge
|
||||
|
||||
// A collection of sets of challenges, each of which would be sufficient
|
||||
// to prove possession of the identifier.
|
||||
// Clients must complete a set of challenges that covers at least one set.
|
||||
// Challenges are identified by their indices in the challenges array.
|
||||
// If this field is empty, the client needs to complete all challenges.
|
||||
Combinations [][]int
|
||||
}
|
||||
|
||||
// AuthzID is an identifier that an account is authorized to represent.
|
||||
type AuthzID struct {
|
||||
Type string // The type of identifier, e.g. "dns".
|
||||
Value string // The identifier itself, e.g. "example.org".
|
||||
}
|
||||
|
||||
// wireAuthz is ACME JSON representation of Authorization objects.
|
||||
type wireAuthz struct {
|
||||
Status string
|
||||
Challenges []wireChallenge
|
||||
Combinations [][]int
|
||||
Identifier struct {
|
||||
Type string
|
||||
Value string
|
||||
}
|
||||
}
|
||||
|
||||
func (z *wireAuthz) authorization(uri string) *Authorization {
|
||||
a := &Authorization{
|
||||
URI: uri,
|
||||
Status: z.Status,
|
||||
Identifier: AuthzID{Type: z.Identifier.Type, Value: z.Identifier.Value},
|
||||
Combinations: z.Combinations, // shallow copy
|
||||
Challenges: make([]*Challenge, len(z.Challenges)),
|
||||
}
|
||||
for i, v := range z.Challenges {
|
||||
a.Challenges[i] = v.challenge()
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// wireChallenge is ACME JSON challenge representation.
|
||||
type wireChallenge struct {
|
||||
URI string `json:"uri"`
|
||||
Type string
|
||||
Token string
|
||||
Status string
|
||||
}
|
||||
|
||||
func (c *wireChallenge) challenge() *Challenge {
|
||||
v := &Challenge{
|
||||
URI: c.URI,
|
||||
Type: c.Type,
|
||||
Token: c.Token,
|
||||
Status: c.Status,
|
||||
}
|
||||
if v.Status == "" {
|
||||
v.Status = StatusPending
|
||||
}
|
||||
return v
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+156
@@ -0,0 +1,156 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package context defines the Context type, which carries deadlines,
|
||||
// cancelation signals, and other request-scoped values across API boundaries
|
||||
// and between processes.
|
||||
//
|
||||
// Incoming requests to a server should create a Context, and outgoing calls to
|
||||
// servers should accept a Context. The chain of function calls between must
|
||||
// propagate the Context, optionally replacing it with a modified copy created
|
||||
// using WithDeadline, WithTimeout, WithCancel, or WithValue.
|
||||
//
|
||||
// Programs that use Contexts should follow these rules to keep interfaces
|
||||
// consistent across packages and enable static analysis tools to check context
|
||||
// propagation:
|
||||
//
|
||||
// Do not store Contexts inside a struct type; instead, pass a Context
|
||||
// explicitly to each function that needs it. The Context should be the first
|
||||
// parameter, typically named ctx:
|
||||
//
|
||||
// func DoSomething(ctx context.Context, arg Arg) error {
|
||||
// // ... use ctx ...
|
||||
// }
|
||||
//
|
||||
// Do not pass a nil Context, even if a function permits it. Pass context.TODO
|
||||
// if you are unsure about which Context to use.
|
||||
//
|
||||
// Use context Values only for request-scoped data that transits processes and
|
||||
// APIs, not for passing optional parameters to functions.
|
||||
//
|
||||
// The same Context may be passed to functions running in different goroutines;
|
||||
// Contexts are safe for simultaneous use by multiple goroutines.
|
||||
//
|
||||
// See http://blog.golang.org/context for example code for a server that uses
|
||||
// Contexts.
|
||||
package context // import "golang.org/x/net/context"
|
||||
|
||||
import "time"
|
||||
|
||||
// A Context carries a deadline, a cancelation signal, and other values across
|
||||
// API boundaries.
|
||||
//
|
||||
// Context's methods may be called by multiple goroutines simultaneously.
|
||||
type Context interface {
|
||||
// Deadline returns the time when work done on behalf of this context
|
||||
// should be canceled. Deadline returns ok==false when no deadline is
|
||||
// set. Successive calls to Deadline return the same results.
|
||||
Deadline() (deadline time.Time, ok bool)
|
||||
|
||||
// Done returns a channel that's closed when work done on behalf of this
|
||||
// context should be canceled. Done may return nil if this context can
|
||||
// never be canceled. Successive calls to Done return the same value.
|
||||
//
|
||||
// WithCancel arranges for Done to be closed when cancel is called;
|
||||
// WithDeadline arranges for Done to be closed when the deadline
|
||||
// expires; WithTimeout arranges for Done to be closed when the timeout
|
||||
// elapses.
|
||||
//
|
||||
// Done is provided for use in select statements:
|
||||
//
|
||||
// // Stream generates values with DoSomething and sends them to out
|
||||
// // until DoSomething returns an error or ctx.Done is closed.
|
||||
// func Stream(ctx context.Context, out chan<- Value) error {
|
||||
// for {
|
||||
// v, err := DoSomething(ctx)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// select {
|
||||
// case <-ctx.Done():
|
||||
// return ctx.Err()
|
||||
// case out <- v:
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// See http://blog.golang.org/pipelines for more examples of how to use
|
||||
// a Done channel for cancelation.
|
||||
Done() <-chan struct{}
|
||||
|
||||
// Err returns a non-nil error value after Done is closed. Err returns
|
||||
// Canceled if the context was canceled or DeadlineExceeded if the
|
||||
// context's deadline passed. No other values for Err are defined.
|
||||
// After Done is closed, successive calls to Err return the same value.
|
||||
Err() error
|
||||
|
||||
// Value returns the value associated with this context for key, or nil
|
||||
// if no value is associated with key. Successive calls to Value with
|
||||
// the same key returns the same result.
|
||||
//
|
||||
// Use context values only for request-scoped data that transits
|
||||
// processes and API boundaries, not for passing optional parameters to
|
||||
// functions.
|
||||
//
|
||||
// A key identifies a specific value in a Context. Functions that wish
|
||||
// to store values in Context typically allocate a key in a global
|
||||
// variable then use that key as the argument to context.WithValue and
|
||||
// Context.Value. A key can be any type that supports equality;
|
||||
// packages should define keys as an unexported type to avoid
|
||||
// collisions.
|
||||
//
|
||||
// Packages that define a Context key should provide type-safe accessors
|
||||
// for the values stores using that key:
|
||||
//
|
||||
// // Package user defines a User type that's stored in Contexts.
|
||||
// package user
|
||||
//
|
||||
// import "golang.org/x/net/context"
|
||||
//
|
||||
// // User is the type of value stored in the Contexts.
|
||||
// type User struct {...}
|
||||
//
|
||||
// // key is an unexported type for keys defined in this package.
|
||||
// // This prevents collisions with keys defined in other packages.
|
||||
// type key int
|
||||
//
|
||||
// // userKey is the key for user.User values in Contexts. It is
|
||||
// // unexported; clients use user.NewContext and user.FromContext
|
||||
// // instead of using this key directly.
|
||||
// var userKey key = 0
|
||||
//
|
||||
// // NewContext returns a new Context that carries value u.
|
||||
// func NewContext(ctx context.Context, u *User) context.Context {
|
||||
// return context.WithValue(ctx, userKey, u)
|
||||
// }
|
||||
//
|
||||
// // FromContext returns the User value stored in ctx, if any.
|
||||
// func FromContext(ctx context.Context) (*User, bool) {
|
||||
// u, ok := ctx.Value(userKey).(*User)
|
||||
// return u, ok
|
||||
// }
|
||||
Value(key interface{}) interface{}
|
||||
}
|
||||
|
||||
// Background returns a non-nil, empty Context. It is never canceled, has no
|
||||
// values, and has no deadline. It is typically used by the main function,
|
||||
// initialization, and tests, and as the top-level Context for incoming
|
||||
// requests.
|
||||
func Background() Context {
|
||||
return background
|
||||
}
|
||||
|
||||
// TODO returns a non-nil, empty Context. Code should use context.TODO when
|
||||
// it's unclear which Context to use or it is not yet available (because the
|
||||
// surrounding function has not yet been extended to accept a Context
|
||||
// parameter). TODO is recognized by static analysis tools that determine
|
||||
// whether Contexts are propagated correctly in a program.
|
||||
func TODO() Context {
|
||||
return todo
|
||||
}
|
||||
|
||||
// A CancelFunc tells an operation to abandon its work.
|
||||
// A CancelFunc does not wait for the work to stop.
|
||||
// After the first call, subsequent calls to a CancelFunc do nothing.
|
||||
type CancelFunc func()
|
||||
+74
@@ -0,0 +1,74 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build go1.7
|
||||
|
||||
// Package ctxhttp provides helper functions for performing context-aware HTTP requests.
|
||||
package ctxhttp // import "golang.org/x/net/context/ctxhttp"
|
||||
|
||||
import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
||||
// Do sends an HTTP request with the provided http.Client and returns
|
||||
// an HTTP response.
|
||||
//
|
||||
// If the client is nil, http.DefaultClient is used.
|
||||
//
|
||||
// The provided ctx must be non-nil. If it is canceled or times out,
|
||||
// ctx.Err() will be returned.
|
||||
func Do(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) {
|
||||
if client == nil {
|
||||
client = http.DefaultClient
|
||||
}
|
||||
resp, err := client.Do(req.WithContext(ctx))
|
||||
// If we got an error, and the context has been canceled,
|
||||
// the context's error is probably more useful.
|
||||
if err != nil {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
err = ctx.Err()
|
||||
default:
|
||||
}
|
||||
}
|
||||
return resp, err
|
||||
}
|
||||
|
||||
// Get issues a GET request via the Do function.
|
||||
func Get(ctx context.Context, client *http.Client, url string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return Do(ctx, client, req)
|
||||
}
|
||||
|
||||
// Head issues a HEAD request via the Do function.
|
||||
func Head(ctx context.Context, client *http.Client, url string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("HEAD", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return Do(ctx, client, req)
|
||||
}
|
||||
|
||||
// Post issues a POST request via the Do function.
|
||||
func Post(ctx context.Context, client *http.Client, url string, bodyType string, body io.Reader) (*http.Response, error) {
|
||||
req, err := http.NewRequest("POST", url, body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Content-Type", bodyType)
|
||||
return Do(ctx, client, req)
|
||||
}
|
||||
|
||||
// PostForm issues a POST request via the Do function.
|
||||
func PostForm(ctx context.Context, client *http.Client, url string, data url.Values) (*http.Response, error) {
|
||||
return Post(ctx, client, url, "application/x-www-form-urlencoded", strings.NewReader(data.Encode()))
|
||||
}
|
||||
+147
@@ -0,0 +1,147 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !go1.7
|
||||
|
||||
package ctxhttp // import "golang.org/x/net/context/ctxhttp"
|
||||
|
||||
import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
||||
func nop() {}
|
||||
|
||||
var (
|
||||
testHookContextDoneBeforeHeaders = nop
|
||||
testHookDoReturned = nop
|
||||
testHookDidBodyClose = nop
|
||||
)
|
||||
|
||||
// Do sends an HTTP request with the provided http.Client and returns an HTTP response.
|
||||
// If the client is nil, http.DefaultClient is used.
|
||||
// If the context is canceled or times out, ctx.Err() will be returned.
|
||||
func Do(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) {
|
||||
if client == nil {
|
||||
client = http.DefaultClient
|
||||
}
|
||||
|
||||
// TODO(djd): Respect any existing value of req.Cancel.
|
||||
cancel := make(chan struct{})
|
||||
req.Cancel = cancel
|
||||
|
||||
type responseAndError struct {
|
||||
resp *http.Response
|
||||
err error
|
||||
}
|
||||
result := make(chan responseAndError, 1)
|
||||
|
||||
// Make local copies of test hooks closed over by goroutines below.
|
||||
// Prevents data races in tests.
|
||||
testHookDoReturned := testHookDoReturned
|
||||
testHookDidBodyClose := testHookDidBodyClose
|
||||
|
||||
go func() {
|
||||
resp, err := client.Do(req)
|
||||
testHookDoReturned()
|
||||
result <- responseAndError{resp, err}
|
||||
}()
|
||||
|
||||
var resp *http.Response
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
testHookContextDoneBeforeHeaders()
|
||||
close(cancel)
|
||||
// Clean up after the goroutine calling client.Do:
|
||||
go func() {
|
||||
if r := <-result; r.resp != nil {
|
||||
testHookDidBodyClose()
|
||||
r.resp.Body.Close()
|
||||
}
|
||||
}()
|
||||
return nil, ctx.Err()
|
||||
case r := <-result:
|
||||
var err error
|
||||
resp, err = r.resp, r.err
|
||||
if err != nil {
|
||||
return resp, err
|
||||
}
|
||||
}
|
||||
|
||||
c := make(chan struct{})
|
||||
go func() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
close(cancel)
|
||||
case <-c:
|
||||
// The response's Body is closed.
|
||||
}
|
||||
}()
|
||||
resp.Body = ¬ifyingReader{resp.Body, c}
|
||||
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
// Get issues a GET request via the Do function.
|
||||
func Get(ctx context.Context, client *http.Client, url string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return Do(ctx, client, req)
|
||||
}
|
||||
|
||||
// Head issues a HEAD request via the Do function.
|
||||
func Head(ctx context.Context, client *http.Client, url string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("HEAD", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return Do(ctx, client, req)
|
||||
}
|
||||
|
||||
// Post issues a POST request via the Do function.
|
||||
func Post(ctx context.Context, client *http.Client, url string, bodyType string, body io.Reader) (*http.Response, error) {
|
||||
req, err := http.NewRequest("POST", url, body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Content-Type", bodyType)
|
||||
return Do(ctx, client, req)
|
||||
}
|
||||
|
||||
// PostForm issues a POST request via the Do function.
|
||||
func PostForm(ctx context.Context, client *http.Client, url string, data url.Values) (*http.Response, error) {
|
||||
return Post(ctx, client, url, "application/x-www-form-urlencoded", strings.NewReader(data.Encode()))
|
||||
}
|
||||
|
||||
// notifyingReader is an io.ReadCloser that closes the notify channel after
|
||||
// Close is called or a Read fails on the underlying ReadCloser.
|
||||
type notifyingReader struct {
|
||||
io.ReadCloser
|
||||
notify chan<- struct{}
|
||||
}
|
||||
|
||||
func (r *notifyingReader) Read(p []byte) (int, error) {
|
||||
n, err := r.ReadCloser.Read(p)
|
||||
if err != nil && r.notify != nil {
|
||||
close(r.notify)
|
||||
r.notify = nil
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
func (r *notifyingReader) Close() error {
|
||||
err := r.ReadCloser.Close()
|
||||
if r.notify != nil {
|
||||
close(r.notify)
|
||||
r.notify = nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
+72
@@ -0,0 +1,72 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build go1.7
|
||||
|
||||
package context
|
||||
|
||||
import (
|
||||
"context" // standard library's context, as of Go 1.7
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
|
||||
todo = context.TODO()
|
||||
background = context.Background()
|
||||
)
|
||||
|
||||
// Canceled is the error returned by Context.Err when the context is canceled.
|
||||
var Canceled = context.Canceled
|
||||
|
||||
// DeadlineExceeded is the error returned by Context.Err when the context's
|
||||
// deadline passes.
|
||||
var DeadlineExceeded = context.DeadlineExceeded
|
||||
|
||||
// WithCancel returns a copy of parent with a new Done channel. The returned
|
||||
// context's Done channel is closed when the returned cancel function is called
|
||||
// or when the parent context's Done channel is closed, whichever happens first.
|
||||
//
|
||||
// Canceling this context releases resources associated with it, so code should
|
||||
// call cancel as soon as the operations running in this Context complete.
|
||||
func WithCancel(parent Context) (ctx Context, cancel CancelFunc) {
|
||||
ctx, f := context.WithCancel(parent)
|
||||
return ctx, CancelFunc(f)
|
||||
}
|
||||
|
||||
// WithDeadline returns a copy of the parent context with the deadline adjusted
|
||||
// to be no later than d. If the parent's deadline is already earlier than d,
|
||||
// WithDeadline(parent, d) is semantically equivalent to parent. The returned
|
||||
// context's Done channel is closed when the deadline expires, when the returned
|
||||
// cancel function is called, or when the parent context's Done channel is
|
||||
// closed, whichever happens first.
|
||||
//
|
||||
// Canceling this context releases resources associated with it, so code should
|
||||
// call cancel as soon as the operations running in this Context complete.
|
||||
func WithDeadline(parent Context, deadline time.Time) (Context, CancelFunc) {
|
||||
ctx, f := context.WithDeadline(parent, deadline)
|
||||
return ctx, CancelFunc(f)
|
||||
}
|
||||
|
||||
// WithTimeout returns WithDeadline(parent, time.Now().Add(timeout)).
|
||||
//
|
||||
// Canceling this context releases resources associated with it, so code should
|
||||
// call cancel as soon as the operations running in this Context complete:
|
||||
//
|
||||
// func slowOperationWithTimeout(ctx context.Context) (Result, error) {
|
||||
// ctx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
|
||||
// defer cancel() // releases resources if slowOperation completes before timeout elapses
|
||||
// return slowOperation(ctx)
|
||||
// }
|
||||
func WithTimeout(parent Context, timeout time.Duration) (Context, CancelFunc) {
|
||||
return WithDeadline(parent, time.Now().Add(timeout))
|
||||
}
|
||||
|
||||
// WithValue returns a copy of parent in which the value associated with key is
|
||||
// val.
|
||||
//
|
||||
// Use context Values only for request-scoped data that transits processes and
|
||||
// APIs, not for passing optional parameters to functions.
|
||||
func WithValue(parent Context, key interface{}, val interface{}) Context {
|
||||
return context.WithValue(parent, key, val)
|
||||
}
|
||||
+300
@@ -0,0 +1,300 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !go1.7
|
||||
|
||||
package context
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// An emptyCtx is never canceled, has no values, and has no deadline. It is not
|
||||
// struct{}, since vars of this type must have distinct addresses.
|
||||
type emptyCtx int
|
||||
|
||||
func (*emptyCtx) Deadline() (deadline time.Time, ok bool) {
|
||||
return
|
||||
}
|
||||
|
||||
func (*emptyCtx) Done() <-chan struct{} {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*emptyCtx) Err() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*emptyCtx) Value(key interface{}) interface{} {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e *emptyCtx) String() string {
|
||||
switch e {
|
||||
case background:
|
||||
return "context.Background"
|
||||
case todo:
|
||||
return "context.TODO"
|
||||
}
|
||||
return "unknown empty Context"
|
||||
}
|
||||
|
||||
var (
|
||||
background = new(emptyCtx)
|
||||
todo = new(emptyCtx)
|
||||
)
|
||||
|
||||
// Canceled is the error returned by Context.Err when the context is canceled.
|
||||
var Canceled = errors.New("context canceled")
|
||||
|
||||
// DeadlineExceeded is the error returned by Context.Err when the context's
|
||||
// deadline passes.
|
||||
var DeadlineExceeded = errors.New("context deadline exceeded")
|
||||
|
||||
// WithCancel returns a copy of parent with a new Done channel. The returned
|
||||
// context's Done channel is closed when the returned cancel function is called
|
||||
// or when the parent context's Done channel is closed, whichever happens first.
|
||||
//
|
||||
// Canceling this context releases resources associated with it, so code should
|
||||
// call cancel as soon as the operations running in this Context complete.
|
||||
func WithCancel(parent Context) (ctx Context, cancel CancelFunc) {
|
||||
c := newCancelCtx(parent)
|
||||
propagateCancel(parent, c)
|
||||
return c, func() { c.cancel(true, Canceled) }
|
||||
}
|
||||
|
||||
// newCancelCtx returns an initialized cancelCtx.
|
||||
func newCancelCtx(parent Context) *cancelCtx {
|
||||
return &cancelCtx{
|
||||
Context: parent,
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// propagateCancel arranges for child to be canceled when parent is.
|
||||
func propagateCancel(parent Context, child canceler) {
|
||||
if parent.Done() == nil {
|
||||
return // parent is never canceled
|
||||
}
|
||||
if p, ok := parentCancelCtx(parent); ok {
|
||||
p.mu.Lock()
|
||||
if p.err != nil {
|
||||
// parent has already been canceled
|
||||
child.cancel(false, p.err)
|
||||
} else {
|
||||
if p.children == nil {
|
||||
p.children = make(map[canceler]bool)
|
||||
}
|
||||
p.children[child] = true
|
||||
}
|
||||
p.mu.Unlock()
|
||||
} else {
|
||||
go func() {
|
||||
select {
|
||||
case <-parent.Done():
|
||||
child.cancel(false, parent.Err())
|
||||
case <-child.Done():
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// parentCancelCtx follows a chain of parent references until it finds a
|
||||
// *cancelCtx. This function understands how each of the concrete types in this
|
||||
// package represents its parent.
|
||||
func parentCancelCtx(parent Context) (*cancelCtx, bool) {
|
||||
for {
|
||||
switch c := parent.(type) {
|
||||
case *cancelCtx:
|
||||
return c, true
|
||||
case *timerCtx:
|
||||
return c.cancelCtx, true
|
||||
case *valueCtx:
|
||||
parent = c.Context
|
||||
default:
|
||||
return nil, false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// removeChild removes a context from its parent.
|
||||
func removeChild(parent Context, child canceler) {
|
||||
p, ok := parentCancelCtx(parent)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
p.mu.Lock()
|
||||
if p.children != nil {
|
||||
delete(p.children, child)
|
||||
}
|
||||
p.mu.Unlock()
|
||||
}
|
||||
|
||||
// A canceler is a context type that can be canceled directly. The
|
||||
// implementations are *cancelCtx and *timerCtx.
|
||||
type canceler interface {
|
||||
cancel(removeFromParent bool, err error)
|
||||
Done() <-chan struct{}
|
||||
}
|
||||
|
||||
// A cancelCtx can be canceled. When canceled, it also cancels any children
|
||||
// that implement canceler.
|
||||
type cancelCtx struct {
|
||||
Context
|
||||
|
||||
done chan struct{} // closed by the first cancel call.
|
||||
|
||||
mu sync.Mutex
|
||||
children map[canceler]bool // set to nil by the first cancel call
|
||||
err error // set to non-nil by the first cancel call
|
||||
}
|
||||
|
||||
func (c *cancelCtx) Done() <-chan struct{} {
|
||||
return c.done
|
||||
}
|
||||
|
||||
func (c *cancelCtx) Err() error {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
return c.err
|
||||
}
|
||||
|
||||
func (c *cancelCtx) String() string {
|
||||
return fmt.Sprintf("%v.WithCancel", c.Context)
|
||||
}
|
||||
|
||||
// cancel closes c.done, cancels each of c's children, and, if
|
||||
// removeFromParent is true, removes c from its parent's children.
|
||||
func (c *cancelCtx) cancel(removeFromParent bool, err error) {
|
||||
if err == nil {
|
||||
panic("context: internal error: missing cancel error")
|
||||
}
|
||||
c.mu.Lock()
|
||||
if c.err != nil {
|
||||
c.mu.Unlock()
|
||||
return // already canceled
|
||||
}
|
||||
c.err = err
|
||||
close(c.done)
|
||||
for child := range c.children {
|
||||
// NOTE: acquiring the child's lock while holding parent's lock.
|
||||
child.cancel(false, err)
|
||||
}
|
||||
c.children = nil
|
||||
c.mu.Unlock()
|
||||
|
||||
if removeFromParent {
|
||||
removeChild(c.Context, c)
|
||||
}
|
||||
}
|
||||
|
||||
// WithDeadline returns a copy of the parent context with the deadline adjusted
|
||||
// to be no later than d. If the parent's deadline is already earlier than d,
|
||||
// WithDeadline(parent, d) is semantically equivalent to parent. The returned
|
||||
// context's Done channel is closed when the deadline expires, when the returned
|
||||
// cancel function is called, or when the parent context's Done channel is
|
||||
// closed, whichever happens first.
|
||||
//
|
||||
// Canceling this context releases resources associated with it, so code should
|
||||
// call cancel as soon as the operations running in this Context complete.
|
||||
func WithDeadline(parent Context, deadline time.Time) (Context, CancelFunc) {
|
||||
if cur, ok := parent.Deadline(); ok && cur.Before(deadline) {
|
||||
// The current deadline is already sooner than the new one.
|
||||
return WithCancel(parent)
|
||||
}
|
||||
c := &timerCtx{
|
||||
cancelCtx: newCancelCtx(parent),
|
||||
deadline: deadline,
|
||||
}
|
||||
propagateCancel(parent, c)
|
||||
d := deadline.Sub(time.Now())
|
||||
if d <= 0 {
|
||||
c.cancel(true, DeadlineExceeded) // deadline has already passed
|
||||
return c, func() { c.cancel(true, Canceled) }
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if c.err == nil {
|
||||
c.timer = time.AfterFunc(d, func() {
|
||||
c.cancel(true, DeadlineExceeded)
|
||||
})
|
||||
}
|
||||
return c, func() { c.cancel(true, Canceled) }
|
||||
}
|
||||
|
||||
// A timerCtx carries a timer and a deadline. It embeds a cancelCtx to
|
||||
// implement Done and Err. It implements cancel by stopping its timer then
|
||||
// delegating to cancelCtx.cancel.
|
||||
type timerCtx struct {
|
||||
*cancelCtx
|
||||
timer *time.Timer // Under cancelCtx.mu.
|
||||
|
||||
deadline time.Time
|
||||
}
|
||||
|
||||
func (c *timerCtx) Deadline() (deadline time.Time, ok bool) {
|
||||
return c.deadline, true
|
||||
}
|
||||
|
||||
func (c *timerCtx) String() string {
|
||||
return fmt.Sprintf("%v.WithDeadline(%s [%s])", c.cancelCtx.Context, c.deadline, c.deadline.Sub(time.Now()))
|
||||
}
|
||||
|
||||
func (c *timerCtx) cancel(removeFromParent bool, err error) {
|
||||
c.cancelCtx.cancel(false, err)
|
||||
if removeFromParent {
|
||||
// Remove this timerCtx from its parent cancelCtx's children.
|
||||
removeChild(c.cancelCtx.Context, c)
|
||||
}
|
||||
c.mu.Lock()
|
||||
if c.timer != nil {
|
||||
c.timer.Stop()
|
||||
c.timer = nil
|
||||
}
|
||||
c.mu.Unlock()
|
||||
}
|
||||
|
||||
// WithTimeout returns WithDeadline(parent, time.Now().Add(timeout)).
|
||||
//
|
||||
// Canceling this context releases resources associated with it, so code should
|
||||
// call cancel as soon as the operations running in this Context complete:
|
||||
//
|
||||
// func slowOperationWithTimeout(ctx context.Context) (Result, error) {
|
||||
// ctx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
|
||||
// defer cancel() // releases resources if slowOperation completes before timeout elapses
|
||||
// return slowOperation(ctx)
|
||||
// }
|
||||
func WithTimeout(parent Context, timeout time.Duration) (Context, CancelFunc) {
|
||||
return WithDeadline(parent, time.Now().Add(timeout))
|
||||
}
|
||||
|
||||
// WithValue returns a copy of parent in which the value associated with key is
|
||||
// val.
|
||||
//
|
||||
// Use context Values only for request-scoped data that transits processes and
|
||||
// APIs, not for passing optional parameters to functions.
|
||||
func WithValue(parent Context, key interface{}, val interface{}) Context {
|
||||
return &valueCtx{parent, key, val}
|
||||
}
|
||||
|
||||
// A valueCtx carries a key-value pair. It implements Value for that key and
|
||||
// delegates all other calls to the embedded Context.
|
||||
type valueCtx struct {
|
||||
Context
|
||||
key, val interface{}
|
||||
}
|
||||
|
||||
func (c *valueCtx) String() string {
|
||||
return fmt.Sprintf("%v.WithValue(%#v, %#v)", c.Context, c.key, c.val)
|
||||
}
|
||||
|
||||
func (c *valueCtx) Value(key interface{}) interface{} {
|
||||
if c.key == key {
|
||||
return c.val
|
||||
}
|
||||
return c.Context.Value(key)
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+78
@@ -0,0 +1,78 @@
|
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package atom provides integer codes (also known as atoms) for a fixed set of
|
||||
// frequently occurring HTML strings: tag names and attribute keys such as "p"
|
||||
// and "id".
|
||||
//
|
||||
// Sharing an atom's name between all elements with the same tag can result in
|
||||
// fewer string allocations when tokenizing and parsing HTML. Integer
|
||||
// comparisons are also generally faster than string comparisons.
|
||||
//
|
||||
// The value of an atom's particular code is not guaranteed to stay the same
|
||||
// between versions of this package. Neither is any ordering guaranteed:
|
||||
// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to
|
||||
// be dense. The only guarantees are that e.g. looking up "div" will yield
|
||||
// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0.
|
||||
package atom // import "golang.org/x/net/html/atom"
|
||||
|
||||
// Atom is an integer code for a string. The zero value maps to "".
|
||||
type Atom uint32
|
||||
|
||||
// String returns the atom's name.
|
||||
func (a Atom) String() string {
|
||||
start := uint32(a >> 8)
|
||||
n := uint32(a & 0xff)
|
||||
if start+n > uint32(len(atomText)) {
|
||||
return ""
|
||||
}
|
||||
return atomText[start : start+n]
|
||||
}
|
||||
|
||||
func (a Atom) string() string {
|
||||
return atomText[a>>8 : a>>8+a&0xff]
|
||||
}
|
||||
|
||||
// fnv computes the FNV hash with an arbitrary starting value h.
|
||||
func fnv(h uint32, s []byte) uint32 {
|
||||
for i := range s {
|
||||
h ^= uint32(s[i])
|
||||
h *= 16777619
|
||||
}
|
||||
return h
|
||||
}
|
||||
|
||||
func match(s string, t []byte) bool {
|
||||
for i, c := range t {
|
||||
if s[i] != c {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Lookup returns the atom whose name is s. It returns zero if there is no
|
||||
// such atom. The lookup is case sensitive.
|
||||
func Lookup(s []byte) Atom {
|
||||
if len(s) == 0 || len(s) > maxAtomLen {
|
||||
return 0
|
||||
}
|
||||
h := fnv(hash0, s)
|
||||
if a := table[h&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) {
|
||||
return a
|
||||
}
|
||||
if a := table[(h>>16)&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) {
|
||||
return a
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// String returns a string whose contents are equal to s. In that sense, it is
|
||||
// equivalent to string(s) but may be more efficient.
|
||||
func String(s []byte) string {
|
||||
if a := Lookup(s); a != 0 {
|
||||
return a.String()
|
||||
}
|
||||
return string(s)
|
||||
}
|
||||
+648
@@ -0,0 +1,648 @@
|
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates table.go and table_test.go.
|
||||
// Invoke as
|
||||
//
|
||||
// go run gen.go |gofmt >table.go
|
||||
// go run gen.go -test |gofmt >table_test.go
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// identifier converts s to a Go exported identifier.
|
||||
// It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
|
||||
func identifier(s string) string {
|
||||
b := make([]byte, 0, len(s))
|
||||
cap := true
|
||||
for _, c := range s {
|
||||
if c == '-' {
|
||||
cap = true
|
||||
continue
|
||||
}
|
||||
if cap && 'a' <= c && c <= 'z' {
|
||||
c -= 'a' - 'A'
|
||||
}
|
||||
cap = false
|
||||
b = append(b, byte(c))
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
var test = flag.Bool("test", false, "generate table_test.go")
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
var all []string
|
||||
all = append(all, elements...)
|
||||
all = append(all, attributes...)
|
||||
all = append(all, eventHandlers...)
|
||||
all = append(all, extra...)
|
||||
sort.Strings(all)
|
||||
|
||||
if *test {
|
||||
fmt.Printf("// generated by go run gen.go -test; DO NOT EDIT\n\n")
|
||||
fmt.Printf("package atom\n\n")
|
||||
fmt.Printf("var testAtomList = []string{\n")
|
||||
for _, s := range all {
|
||||
fmt.Printf("\t%q,\n", s)
|
||||
}
|
||||
fmt.Printf("}\n")
|
||||
return
|
||||
}
|
||||
|
||||
// uniq - lists have dups
|
||||
// compute max len too
|
||||
maxLen := 0
|
||||
w := 0
|
||||
for _, s := range all {
|
||||
if w == 0 || all[w-1] != s {
|
||||
if maxLen < len(s) {
|
||||
maxLen = len(s)
|
||||
}
|
||||
all[w] = s
|
||||
w++
|
||||
}
|
||||
}
|
||||
all = all[:w]
|
||||
|
||||
// Find hash that minimizes table size.
|
||||
var best *table
|
||||
for i := 0; i < 1000000; i++ {
|
||||
if best != nil && 1<<(best.k-1) < len(all) {
|
||||
break
|
||||
}
|
||||
h := rand.Uint32()
|
||||
for k := uint(0); k <= 16; k++ {
|
||||
if best != nil && k >= best.k {
|
||||
break
|
||||
}
|
||||
var t table
|
||||
if t.init(h, k, all) {
|
||||
best = &t
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if best == nil {
|
||||
fmt.Fprintf(os.Stderr, "failed to construct string table\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Lay out strings, using overlaps when possible.
|
||||
layout := append([]string{}, all...)
|
||||
|
||||
// Remove strings that are substrings of other strings
|
||||
for changed := true; changed; {
|
||||
changed = false
|
||||
for i, s := range layout {
|
||||
if s == "" {
|
||||
continue
|
||||
}
|
||||
for j, t := range layout {
|
||||
if i != j && t != "" && strings.Contains(s, t) {
|
||||
changed = true
|
||||
layout[j] = ""
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Join strings where one suffix matches another prefix.
|
||||
for {
|
||||
// Find best i, j, k such that layout[i][len-k:] == layout[j][:k],
|
||||
// maximizing overlap length k.
|
||||
besti := -1
|
||||
bestj := -1
|
||||
bestk := 0
|
||||
for i, s := range layout {
|
||||
if s == "" {
|
||||
continue
|
||||
}
|
||||
for j, t := range layout {
|
||||
if i == j {
|
||||
continue
|
||||
}
|
||||
for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
|
||||
if s[len(s)-k:] == t[:k] {
|
||||
besti = i
|
||||
bestj = j
|
||||
bestk = k
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if bestk > 0 {
|
||||
layout[besti] += layout[bestj][bestk:]
|
||||
layout[bestj] = ""
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
text := strings.Join(layout, "")
|
||||
|
||||
atom := map[string]uint32{}
|
||||
for _, s := range all {
|
||||
off := strings.Index(text, s)
|
||||
if off < 0 {
|
||||
panic("lost string " + s)
|
||||
}
|
||||
atom[s] = uint32(off<<8 | len(s))
|
||||
}
|
||||
|
||||
// Generate the Go code.
|
||||
fmt.Printf("// generated by go run gen.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("package atom\n\nconst (\n")
|
||||
for _, s := range all {
|
||||
fmt.Printf("\t%s Atom = %#x\n", identifier(s), atom[s])
|
||||
}
|
||||
fmt.Printf(")\n\n")
|
||||
|
||||
fmt.Printf("const hash0 = %#x\n\n", best.h0)
|
||||
fmt.Printf("const maxAtomLen = %d\n\n", maxLen)
|
||||
|
||||
fmt.Printf("var table = [1<<%d]Atom{\n", best.k)
|
||||
for i, s := range best.tab {
|
||||
if s == "" {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%#x: %#x, // %s\n", i, atom[s], s)
|
||||
}
|
||||
fmt.Printf("}\n")
|
||||
datasize := (1 << best.k) * 4
|
||||
|
||||
fmt.Printf("const atomText =\n")
|
||||
textsize := len(text)
|
||||
for len(text) > 60 {
|
||||
fmt.Printf("\t%q +\n", text[:60])
|
||||
text = text[60:]
|
||||
}
|
||||
fmt.Printf("\t%q\n\n", text)
|
||||
|
||||
fmt.Fprintf(os.Stderr, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize)
|
||||
}
|
||||
|
||||
type byLen []string
|
||||
|
||||
func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) }
|
||||
func (x byLen) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
|
||||
func (x byLen) Len() int { return len(x) }
|
||||
|
||||
// fnv computes the FNV hash with an arbitrary starting value h.
|
||||
func fnv(h uint32, s string) uint32 {
|
||||
for i := 0; i < len(s); i++ {
|
||||
h ^= uint32(s[i])
|
||||
h *= 16777619
|
||||
}
|
||||
return h
|
||||
}
|
||||
|
||||
// A table represents an attempt at constructing the lookup table.
|
||||
// The lookup table uses cuckoo hashing, meaning that each string
|
||||
// can be found in one of two positions.
|
||||
type table struct {
|
||||
h0 uint32
|
||||
k uint
|
||||
mask uint32
|
||||
tab []string
|
||||
}
|
||||
|
||||
// hash returns the two hashes for s.
|
||||
func (t *table) hash(s string) (h1, h2 uint32) {
|
||||
h := fnv(t.h0, s)
|
||||
h1 = h & t.mask
|
||||
h2 = (h >> 16) & t.mask
|
||||
return
|
||||
}
|
||||
|
||||
// init initializes the table with the given parameters.
|
||||
// h0 is the initial hash value,
|
||||
// k is the number of bits of hash value to use, and
|
||||
// x is the list of strings to store in the table.
|
||||
// init returns false if the table cannot be constructed.
|
||||
func (t *table) init(h0 uint32, k uint, x []string) bool {
|
||||
t.h0 = h0
|
||||
t.k = k
|
||||
t.tab = make([]string, 1<<k)
|
||||
t.mask = 1<<k - 1
|
||||
for _, s := range x {
|
||||
if !t.insert(s) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// insert inserts s in the table.
|
||||
func (t *table) insert(s string) bool {
|
||||
h1, h2 := t.hash(s)
|
||||
if t.tab[h1] == "" {
|
||||
t.tab[h1] = s
|
||||
return true
|
||||
}
|
||||
if t.tab[h2] == "" {
|
||||
t.tab[h2] = s
|
||||
return true
|
||||
}
|
||||
if t.push(h1, 0) {
|
||||
t.tab[h1] = s
|
||||
return true
|
||||
}
|
||||
if t.push(h2, 0) {
|
||||
t.tab[h2] = s
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// push attempts to push aside the entry in slot i.
|
||||
func (t *table) push(i uint32, depth int) bool {
|
||||
if depth > len(t.tab) {
|
||||
return false
|
||||
}
|
||||
s := t.tab[i]
|
||||
h1, h2 := t.hash(s)
|
||||
j := h1 + h2 - i
|
||||
if t.tab[j] != "" && !t.push(j, depth+1) {
|
||||
return false
|
||||
}
|
||||
t.tab[j] = s
|
||||
return true
|
||||
}
|
||||
|
||||
// The lists of element names and attribute keys were taken from
|
||||
// https://html.spec.whatwg.org/multipage/indices.html#index
|
||||
// as of the "HTML Living Standard - Last Updated 21 February 2015" version.
|
||||
|
||||
var elements = []string{
|
||||
"a",
|
||||
"abbr",
|
||||
"address",
|
||||
"area",
|
||||
"article",
|
||||
"aside",
|
||||
"audio",
|
||||
"b",
|
||||
"base",
|
||||
"bdi",
|
||||
"bdo",
|
||||
"blockquote",
|
||||
"body",
|
||||
"br",
|
||||
"button",
|
||||
"canvas",
|
||||
"caption",
|
||||
"cite",
|
||||
"code",
|
||||
"col",
|
||||
"colgroup",
|
||||
"command",
|
||||
"data",
|
||||
"datalist",
|
||||
"dd",
|
||||
"del",
|
||||
"details",
|
||||
"dfn",
|
||||
"dialog",
|
||||
"div",
|
||||
"dl",
|
||||
"dt",
|
||||
"em",
|
||||
"embed",
|
||||
"fieldset",
|
||||
"figcaption",
|
||||
"figure",
|
||||
"footer",
|
||||
"form",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"head",
|
||||
"header",
|
||||
"hgroup",
|
||||
"hr",
|
||||
"html",
|
||||
"i",
|
||||
"iframe",
|
||||
"img",
|
||||
"input",
|
||||
"ins",
|
||||
"kbd",
|
||||
"keygen",
|
||||
"label",
|
||||
"legend",
|
||||
"li",
|
||||
"link",
|
||||
"map",
|
||||
"mark",
|
||||
"menu",
|
||||
"menuitem",
|
||||
"meta",
|
||||
"meter",
|
||||
"nav",
|
||||
"noscript",
|
||||
"object",
|
||||
"ol",
|
||||
"optgroup",
|
||||
"option",
|
||||
"output",
|
||||
"p",
|
||||
"param",
|
||||
"pre",
|
||||
"progress",
|
||||
"q",
|
||||
"rp",
|
||||
"rt",
|
||||
"ruby",
|
||||
"s",
|
||||
"samp",
|
||||
"script",
|
||||
"section",
|
||||
"select",
|
||||
"small",
|
||||
"source",
|
||||
"span",
|
||||
"strong",
|
||||
"style",
|
||||
"sub",
|
||||
"summary",
|
||||
"sup",
|
||||
"table",
|
||||
"tbody",
|
||||
"td",
|
||||
"template",
|
||||
"textarea",
|
||||
"tfoot",
|
||||
"th",
|
||||
"thead",
|
||||
"time",
|
||||
"title",
|
||||
"tr",
|
||||
"track",
|
||||
"u",
|
||||
"ul",
|
||||
"var",
|
||||
"video",
|
||||
"wbr",
|
||||
}
|
||||
|
||||
// https://html.spec.whatwg.org/multipage/indices.html#attributes-3
|
||||
|
||||
var attributes = []string{
|
||||
"abbr",
|
||||
"accept",
|
||||
"accept-charset",
|
||||
"accesskey",
|
||||
"action",
|
||||
"alt",
|
||||
"async",
|
||||
"autocomplete",
|
||||
"autofocus",
|
||||
"autoplay",
|
||||
"challenge",
|
||||
"charset",
|
||||
"checked",
|
||||
"cite",
|
||||
"class",
|
||||
"cols",
|
||||
"colspan",
|
||||
"command",
|
||||
"content",
|
||||
"contenteditable",
|
||||
"contextmenu",
|
||||
"controls",
|
||||
"coords",
|
||||
"crossorigin",
|
||||
"data",
|
||||
"datetime",
|
||||
"default",
|
||||
"defer",
|
||||
"dir",
|
||||
"dirname",
|
||||
"disabled",
|
||||
"download",
|
||||
"draggable",
|
||||
"dropzone",
|
||||
"enctype",
|
||||
"for",
|
||||
"form",
|
||||
"formaction",
|
||||
"formenctype",
|
||||
"formmethod",
|
||||
"formnovalidate",
|
||||
"formtarget",
|
||||
"headers",
|
||||
"height",
|
||||
"hidden",
|
||||
"high",
|
||||
"href",
|
||||
"hreflang",
|
||||
"http-equiv",
|
||||
"icon",
|
||||
"id",
|
||||
"inputmode",
|
||||
"ismap",
|
||||
"itemid",
|
||||
"itemprop",
|
||||
"itemref",
|
||||
"itemscope",
|
||||
"itemtype",
|
||||
"keytype",
|
||||
"kind",
|
||||
"label",
|
||||
"lang",
|
||||
"list",
|
||||
"loop",
|
||||
"low",
|
||||
"manifest",
|
||||
"max",
|
||||
"maxlength",
|
||||
"media",
|
||||
"mediagroup",
|
||||
"method",
|
||||
"min",
|
||||
"minlength",
|
||||
"multiple",
|
||||
"muted",
|
||||
"name",
|
||||
"novalidate",
|
||||
"open",
|
||||
"optimum",
|
||||
"pattern",
|
||||
"ping",
|
||||
"placeholder",
|
||||
"poster",
|
||||
"preload",
|
||||
"radiogroup",
|
||||
"readonly",
|
||||
"rel",
|
||||
"required",
|
||||
"reversed",
|
||||
"rows",
|
||||
"rowspan",
|
||||
"sandbox",
|
||||
"spellcheck",
|
||||
"scope",
|
||||
"scoped",
|
||||
"seamless",
|
||||
"selected",
|
||||
"shape",
|
||||
"size",
|
||||
"sizes",
|
||||
"sortable",
|
||||
"sorted",
|
||||
"span",
|
||||
"src",
|
||||
"srcdoc",
|
||||
"srclang",
|
||||
"start",
|
||||
"step",
|
||||
"style",
|
||||
"tabindex",
|
||||
"target",
|
||||
"title",
|
||||
"translate",
|
||||
"type",
|
||||
"typemustmatch",
|
||||
"usemap",
|
||||
"value",
|
||||
"width",
|
||||
"wrap",
|
||||
}
|
||||
|
||||
var eventHandlers = []string{
|
||||
"onabort",
|
||||
"onautocomplete",
|
||||
"onautocompleteerror",
|
||||
"onafterprint",
|
||||
"onbeforeprint",
|
||||
"onbeforeunload",
|
||||
"onblur",
|
||||
"oncancel",
|
||||
"oncanplay",
|
||||
"oncanplaythrough",
|
||||
"onchange",
|
||||
"onclick",
|
||||
"onclose",
|
||||
"oncontextmenu",
|
||||
"oncuechange",
|
||||
"ondblclick",
|
||||
"ondrag",
|
||||
"ondragend",
|
||||
"ondragenter",
|
||||
"ondragleave",
|
||||
"ondragover",
|
||||
"ondragstart",
|
||||
"ondrop",
|
||||
"ondurationchange",
|
||||
"onemptied",
|
||||
"onended",
|
||||
"onerror",
|
||||
"onfocus",
|
||||
"onhashchange",
|
||||
"oninput",
|
||||
"oninvalid",
|
||||
"onkeydown",
|
||||
"onkeypress",
|
||||
"onkeyup",
|
||||
"onlanguagechange",
|
||||
"onload",
|
||||
"onloadeddata",
|
||||
"onloadedmetadata",
|
||||
"onloadstart",
|
||||
"onmessage",
|
||||
"onmousedown",
|
||||
"onmousemove",
|
||||
"onmouseout",
|
||||
"onmouseover",
|
||||
"onmouseup",
|
||||
"onmousewheel",
|
||||
"onoffline",
|
||||
"ononline",
|
||||
"onpagehide",
|
||||
"onpageshow",
|
||||
"onpause",
|
||||
"onplay",
|
||||
"onplaying",
|
||||
"onpopstate",
|
||||
"onprogress",
|
||||
"onratechange",
|
||||
"onreset",
|
||||
"onresize",
|
||||
"onscroll",
|
||||
"onseeked",
|
||||
"onseeking",
|
||||
"onselect",
|
||||
"onshow",
|
||||
"onsort",
|
||||
"onstalled",
|
||||
"onstorage",
|
||||
"onsubmit",
|
||||
"onsuspend",
|
||||
"ontimeupdate",
|
||||
"ontoggle",
|
||||
"onunload",
|
||||
"onvolumechange",
|
||||
"onwaiting",
|
||||
}
|
||||
|
||||
// extra are ad-hoc values not covered by any of the lists above.
|
||||
var extra = []string{
|
||||
"align",
|
||||
"annotation",
|
||||
"annotation-xml",
|
||||
"applet",
|
||||
"basefont",
|
||||
"bgsound",
|
||||
"big",
|
||||
"blink",
|
||||
"center",
|
||||
"color",
|
||||
"desc",
|
||||
"face",
|
||||
"font",
|
||||
"foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive.
|
||||
"foreignobject",
|
||||
"frame",
|
||||
"frameset",
|
||||
"image",
|
||||
"isindex",
|
||||
"listing",
|
||||
"malignmark",
|
||||
"marquee",
|
||||
"math",
|
||||
"mglyph",
|
||||
"mi",
|
||||
"mn",
|
||||
"mo",
|
||||
"ms",
|
||||
"mtext",
|
||||
"nobr",
|
||||
"noembed",
|
||||
"noframes",
|
||||
"plaintext",
|
||||
"prompt",
|
||||
"public",
|
||||
"spacer",
|
||||
"strike",
|
||||
"svg",
|
||||
"system",
|
||||
"tt",
|
||||
"xmp",
|
||||
}
|
||||
+713
@@ -0,0 +1,713 @@
|
||||
// generated by go run gen.go; DO NOT EDIT
|
||||
|
||||
package atom
|
||||
|
||||
const (
|
||||
A Atom = 0x1
|
||||
Abbr Atom = 0x4
|
||||
Accept Atom = 0x2106
|
||||
AcceptCharset Atom = 0x210e
|
||||
Accesskey Atom = 0x3309
|
||||
Action Atom = 0x1f606
|
||||
Address Atom = 0x4f307
|
||||
Align Atom = 0x1105
|
||||
Alt Atom = 0x4503
|
||||
Annotation Atom = 0x1670a
|
||||
AnnotationXml Atom = 0x1670e
|
||||
Applet Atom = 0x2b306
|
||||
Area Atom = 0x2fa04
|
||||
Article Atom = 0x38807
|
||||
Aside Atom = 0x8305
|
||||
Async Atom = 0x7b05
|
||||
Audio Atom = 0xa605
|
||||
Autocomplete Atom = 0x1fc0c
|
||||
Autofocus Atom = 0xb309
|
||||
Autoplay Atom = 0xce08
|
||||
B Atom = 0x101
|
||||
Base Atom = 0xd604
|
||||
Basefont Atom = 0xd608
|
||||
Bdi Atom = 0x1a03
|
||||
Bdo Atom = 0xe703
|
||||
Bgsound Atom = 0x11807
|
||||
Big Atom = 0x12403
|
||||
Blink Atom = 0x12705
|
||||
Blockquote Atom = 0x12c0a
|
||||
Body Atom = 0x2f04
|
||||
Br Atom = 0x202
|
||||
Button Atom = 0x13606
|
||||
Canvas Atom = 0x7f06
|
||||
Caption Atom = 0x1bb07
|
||||
Center Atom = 0x5b506
|
||||
Challenge Atom = 0x21f09
|
||||
Charset Atom = 0x2807
|
||||
Checked Atom = 0x32807
|
||||
Cite Atom = 0x3c804
|
||||
Class Atom = 0x4de05
|
||||
Code Atom = 0x14904
|
||||
Col Atom = 0x15003
|
||||
Colgroup Atom = 0x15008
|
||||
Color Atom = 0x15d05
|
||||
Cols Atom = 0x16204
|
||||
Colspan Atom = 0x16207
|
||||
Command Atom = 0x17507
|
||||
Content Atom = 0x42307
|
||||
Contenteditable Atom = 0x4230f
|
||||
Contextmenu Atom = 0x3310b
|
||||
Controls Atom = 0x18808
|
||||
Coords Atom = 0x19406
|
||||
Crossorigin Atom = 0x19f0b
|
||||
Data Atom = 0x44a04
|
||||
Datalist Atom = 0x44a08
|
||||
Datetime Atom = 0x23c08
|
||||
Dd Atom = 0x26702
|
||||
Default Atom = 0x8607
|
||||
Defer Atom = 0x14b05
|
||||
Del Atom = 0x3ef03
|
||||
Desc Atom = 0x4db04
|
||||
Details Atom = 0x4807
|
||||
Dfn Atom = 0x6103
|
||||
Dialog Atom = 0x1b06
|
||||
Dir Atom = 0x6903
|
||||
Dirname Atom = 0x6907
|
||||
Disabled Atom = 0x10c08
|
||||
Div Atom = 0x11303
|
||||
Dl Atom = 0x11e02
|
||||
Download Atom = 0x40008
|
||||
Draggable Atom = 0x17b09
|
||||
Dropzone Atom = 0x39108
|
||||
Dt Atom = 0x50902
|
||||
Em Atom = 0x6502
|
||||
Embed Atom = 0x6505
|
||||
Enctype Atom = 0x21107
|
||||
Face Atom = 0x5b304
|
||||
Fieldset Atom = 0x1b008
|
||||
Figcaption Atom = 0x1b80a
|
||||
Figure Atom = 0x1cc06
|
||||
Font Atom = 0xda04
|
||||
Footer Atom = 0x8d06
|
||||
For Atom = 0x1d803
|
||||
ForeignObject Atom = 0x1d80d
|
||||
Foreignobject Atom = 0x1e50d
|
||||
Form Atom = 0x1f204
|
||||
Formaction Atom = 0x1f20a
|
||||
Formenctype Atom = 0x20d0b
|
||||
Formmethod Atom = 0x2280a
|
||||
Formnovalidate Atom = 0x2320e
|
||||
Formtarget Atom = 0x2470a
|
||||
Frame Atom = 0x9a05
|
||||
Frameset Atom = 0x9a08
|
||||
H1 Atom = 0x26e02
|
||||
H2 Atom = 0x29402
|
||||
H3 Atom = 0x2a702
|
||||
H4 Atom = 0x2e902
|
||||
H5 Atom = 0x2f302
|
||||
H6 Atom = 0x50b02
|
||||
Head Atom = 0x2d504
|
||||
Header Atom = 0x2d506
|
||||
Headers Atom = 0x2d507
|
||||
Height Atom = 0x25106
|
||||
Hgroup Atom = 0x25906
|
||||
Hidden Atom = 0x26506
|
||||
High Atom = 0x26b04
|
||||
Hr Atom = 0x27002
|
||||
Href Atom = 0x27004
|
||||
Hreflang Atom = 0x27008
|
||||
Html Atom = 0x25504
|
||||
HttpEquiv Atom = 0x2780a
|
||||
I Atom = 0x601
|
||||
Icon Atom = 0x42204
|
||||
Id Atom = 0x8502
|
||||
Iframe Atom = 0x29606
|
||||
Image Atom = 0x29c05
|
||||
Img Atom = 0x2a103
|
||||
Input Atom = 0x3e805
|
||||
Inputmode Atom = 0x3e809
|
||||
Ins Atom = 0x1a803
|
||||
Isindex Atom = 0x2a907
|
||||
Ismap Atom = 0x2b005
|
||||
Itemid Atom = 0x33c06
|
||||
Itemprop Atom = 0x3c908
|
||||
Itemref Atom = 0x5ad07
|
||||
Itemscope Atom = 0x2b909
|
||||
Itemtype Atom = 0x2c308
|
||||
Kbd Atom = 0x1903
|
||||
Keygen Atom = 0x3906
|
||||
Keytype Atom = 0x53707
|
||||
Kind Atom = 0x10904
|
||||
Label Atom = 0xf005
|
||||
Lang Atom = 0x27404
|
||||
Legend Atom = 0x18206
|
||||
Li Atom = 0x1202
|
||||
Link Atom = 0x12804
|
||||
List Atom = 0x44e04
|
||||
Listing Atom = 0x44e07
|
||||
Loop Atom = 0xf404
|
||||
Low Atom = 0x11f03
|
||||
Malignmark Atom = 0x100a
|
||||
Manifest Atom = 0x5f108
|
||||
Map Atom = 0x2b203
|
||||
Mark Atom = 0x1604
|
||||
Marquee Atom = 0x2cb07
|
||||
Math Atom = 0x2d204
|
||||
Max Atom = 0x2e103
|
||||
Maxlength Atom = 0x2e109
|
||||
Media Atom = 0x6e05
|
||||
Mediagroup Atom = 0x6e0a
|
||||
Menu Atom = 0x33804
|
||||
Menuitem Atom = 0x33808
|
||||
Meta Atom = 0x45d04
|
||||
Meter Atom = 0x24205
|
||||
Method Atom = 0x22c06
|
||||
Mglyph Atom = 0x2a206
|
||||
Mi Atom = 0x2eb02
|
||||
Min Atom = 0x2eb03
|
||||
Minlength Atom = 0x2eb09
|
||||
Mn Atom = 0x23502
|
||||
Mo Atom = 0x3ed02
|
||||
Ms Atom = 0x2bc02
|
||||
Mtext Atom = 0x2f505
|
||||
Multiple Atom = 0x30308
|
||||
Muted Atom = 0x30b05
|
||||
Name Atom = 0x6c04
|
||||
Nav Atom = 0x3e03
|
||||
Nobr Atom = 0x5704
|
||||
Noembed Atom = 0x6307
|
||||
Noframes Atom = 0x9808
|
||||
Noscript Atom = 0x3d208
|
||||
Novalidate Atom = 0x2360a
|
||||
Object Atom = 0x1ec06
|
||||
Ol Atom = 0xc902
|
||||
Onabort Atom = 0x13a07
|
||||
Onafterprint Atom = 0x1c00c
|
||||
Onautocomplete Atom = 0x1fa0e
|
||||
Onautocompleteerror Atom = 0x1fa13
|
||||
Onbeforeprint Atom = 0x6040d
|
||||
Onbeforeunload Atom = 0x4e70e
|
||||
Onblur Atom = 0xaa06
|
||||
Oncancel Atom = 0xe908
|
||||
Oncanplay Atom = 0x28509
|
||||
Oncanplaythrough Atom = 0x28510
|
||||
Onchange Atom = 0x3a708
|
||||
Onclick Atom = 0x31007
|
||||
Onclose Atom = 0x31707
|
||||
Oncontextmenu Atom = 0x32f0d
|
||||
Oncuechange Atom = 0x3420b
|
||||
Ondblclick Atom = 0x34d0a
|
||||
Ondrag Atom = 0x35706
|
||||
Ondragend Atom = 0x35709
|
||||
Ondragenter Atom = 0x3600b
|
||||
Ondragleave Atom = 0x36b0b
|
||||
Ondragover Atom = 0x3760a
|
||||
Ondragstart Atom = 0x3800b
|
||||
Ondrop Atom = 0x38f06
|
||||
Ondurationchange Atom = 0x39f10
|
||||
Onemptied Atom = 0x39609
|
||||
Onended Atom = 0x3af07
|
||||
Onerror Atom = 0x3b607
|
||||
Onfocus Atom = 0x3bd07
|
||||
Onhashchange Atom = 0x3da0c
|
||||
Oninput Atom = 0x3e607
|
||||
Oninvalid Atom = 0x3f209
|
||||
Onkeydown Atom = 0x3fb09
|
||||
Onkeypress Atom = 0x4080a
|
||||
Onkeyup Atom = 0x41807
|
||||
Onlanguagechange Atom = 0x43210
|
||||
Onload Atom = 0x44206
|
||||
Onloadeddata Atom = 0x4420c
|
||||
Onloadedmetadata Atom = 0x45510
|
||||
Onloadstart Atom = 0x46b0b
|
||||
Onmessage Atom = 0x47609
|
||||
Onmousedown Atom = 0x47f0b
|
||||
Onmousemove Atom = 0x48a0b
|
||||
Onmouseout Atom = 0x4950a
|
||||
Onmouseover Atom = 0x4a20b
|
||||
Onmouseup Atom = 0x4ad09
|
||||
Onmousewheel Atom = 0x4b60c
|
||||
Onoffline Atom = 0x4c209
|
||||
Ononline Atom = 0x4cb08
|
||||
Onpagehide Atom = 0x4d30a
|
||||
Onpageshow Atom = 0x4fe0a
|
||||
Onpause Atom = 0x50d07
|
||||
Onplay Atom = 0x51706
|
||||
Onplaying Atom = 0x51709
|
||||
Onpopstate Atom = 0x5200a
|
||||
Onprogress Atom = 0x52a0a
|
||||
Onratechange Atom = 0x53e0c
|
||||
Onreset Atom = 0x54a07
|
||||
Onresize Atom = 0x55108
|
||||
Onscroll Atom = 0x55f08
|
||||
Onseeked Atom = 0x56708
|
||||
Onseeking Atom = 0x56f09
|
||||
Onselect Atom = 0x57808
|
||||
Onshow Atom = 0x58206
|
||||
Onsort Atom = 0x58b06
|
||||
Onstalled Atom = 0x59509
|
||||
Onstorage Atom = 0x59e09
|
||||
Onsubmit Atom = 0x5a708
|
||||
Onsuspend Atom = 0x5bb09
|
||||
Ontimeupdate Atom = 0xdb0c
|
||||
Ontoggle Atom = 0x5c408
|
||||
Onunload Atom = 0x5cc08
|
||||
Onvolumechange Atom = 0x5d40e
|
||||
Onwaiting Atom = 0x5e209
|
||||
Open Atom = 0x3cf04
|
||||
Optgroup Atom = 0xf608
|
||||
Optimum Atom = 0x5eb07
|
||||
Option Atom = 0x60006
|
||||
Output Atom = 0x49c06
|
||||
P Atom = 0xc01
|
||||
Param Atom = 0xc05
|
||||
Pattern Atom = 0x5107
|
||||
Ping Atom = 0x7704
|
||||
Placeholder Atom = 0xc30b
|
||||
Plaintext Atom = 0xfd09
|
||||
Poster Atom = 0x15706
|
||||
Pre Atom = 0x25e03
|
||||
Preload Atom = 0x25e07
|
||||
Progress Atom = 0x52c08
|
||||
Prompt Atom = 0x5fa06
|
||||
Public Atom = 0x41e06
|
||||
Q Atom = 0x13101
|
||||
Radiogroup Atom = 0x30a
|
||||
Readonly Atom = 0x2fb08
|
||||
Rel Atom = 0x25f03
|
||||
Required Atom = 0x1d008
|
||||
Reversed Atom = 0x5a08
|
||||
Rows Atom = 0x9204
|
||||
Rowspan Atom = 0x9207
|
||||
Rp Atom = 0x1c602
|
||||
Rt Atom = 0x13f02
|
||||
Ruby Atom = 0xaf04
|
||||
S Atom = 0x2c01
|
||||
Samp Atom = 0x4e04
|
||||
Sandbox Atom = 0xbb07
|
||||
Scope Atom = 0x2bd05
|
||||
Scoped Atom = 0x2bd06
|
||||
Script Atom = 0x3d406
|
||||
Seamless Atom = 0x31c08
|
||||
Section Atom = 0x4e207
|
||||
Select Atom = 0x57a06
|
||||
Selected Atom = 0x57a08
|
||||
Shape Atom = 0x4f905
|
||||
Size Atom = 0x55504
|
||||
Sizes Atom = 0x55505
|
||||
Small Atom = 0x18f05
|
||||
Sortable Atom = 0x58d08
|
||||
Sorted Atom = 0x19906
|
||||
Source Atom = 0x1aa06
|
||||
Spacer Atom = 0x2db06
|
||||
Span Atom = 0x9504
|
||||
Spellcheck Atom = 0x3230a
|
||||
Src Atom = 0x3c303
|
||||
Srcdoc Atom = 0x3c306
|
||||
Srclang Atom = 0x41107
|
||||
Start Atom = 0x38605
|
||||
Step Atom = 0x5f704
|
||||
Strike Atom = 0x53306
|
||||
Strong Atom = 0x55906
|
||||
Style Atom = 0x61105
|
||||
Sub Atom = 0x5a903
|
||||
Summary Atom = 0x61607
|
||||
Sup Atom = 0x61d03
|
||||
Svg Atom = 0x62003
|
||||
System Atom = 0x62306
|
||||
Tabindex Atom = 0x46308
|
||||
Table Atom = 0x42d05
|
||||
Target Atom = 0x24b06
|
||||
Tbody Atom = 0x2e05
|
||||
Td Atom = 0x4702
|
||||
Template Atom = 0x62608
|
||||
Textarea Atom = 0x2f608
|
||||
Tfoot Atom = 0x8c05
|
||||
Th Atom = 0x22e02
|
||||
Thead Atom = 0x2d405
|
||||
Time Atom = 0xdd04
|
||||
Title Atom = 0xa105
|
||||
Tr Atom = 0x10502
|
||||
Track Atom = 0x10505
|
||||
Translate Atom = 0x14009
|
||||
Tt Atom = 0x5302
|
||||
Type Atom = 0x21404
|
||||
Typemustmatch Atom = 0x2140d
|
||||
U Atom = 0xb01
|
||||
Ul Atom = 0x8a02
|
||||
Usemap Atom = 0x51106
|
||||
Value Atom = 0x4005
|
||||
Var Atom = 0x11503
|
||||
Video Atom = 0x28105
|
||||
Wbr Atom = 0x12103
|
||||
Width Atom = 0x50705
|
||||
Wrap Atom = 0x58704
|
||||
Xmp Atom = 0xc103
|
||||
)
|
||||
|
||||
const hash0 = 0xc17da63e
|
||||
|
||||
const maxAtomLen = 19
|
||||
|
||||
var table = [1 << 9]Atom{
|
||||
0x1: 0x48a0b, // onmousemove
|
||||
0x2: 0x5e209, // onwaiting
|
||||
0x3: 0x1fa13, // onautocompleteerror
|
||||
0x4: 0x5fa06, // prompt
|
||||
0x7: 0x5eb07, // optimum
|
||||
0x8: 0x1604, // mark
|
||||
0xa: 0x5ad07, // itemref
|
||||
0xb: 0x4fe0a, // onpageshow
|
||||
0xc: 0x57a06, // select
|
||||
0xd: 0x17b09, // draggable
|
||||
0xe: 0x3e03, // nav
|
||||
0xf: 0x17507, // command
|
||||
0x11: 0xb01, // u
|
||||
0x14: 0x2d507, // headers
|
||||
0x15: 0x44a08, // datalist
|
||||
0x17: 0x4e04, // samp
|
||||
0x1a: 0x3fb09, // onkeydown
|
||||
0x1b: 0x55f08, // onscroll
|
||||
0x1c: 0x15003, // col
|
||||
0x20: 0x3c908, // itemprop
|
||||
0x21: 0x2780a, // http-equiv
|
||||
0x22: 0x61d03, // sup
|
||||
0x24: 0x1d008, // required
|
||||
0x2b: 0x25e07, // preload
|
||||
0x2c: 0x6040d, // onbeforeprint
|
||||
0x2d: 0x3600b, // ondragenter
|
||||
0x2e: 0x50902, // dt
|
||||
0x2f: 0x5a708, // onsubmit
|
||||
0x30: 0x27002, // hr
|
||||
0x31: 0x32f0d, // oncontextmenu
|
||||
0x33: 0x29c05, // image
|
||||
0x34: 0x50d07, // onpause
|
||||
0x35: 0x25906, // hgroup
|
||||
0x36: 0x7704, // ping
|
||||
0x37: 0x57808, // onselect
|
||||
0x3a: 0x11303, // div
|
||||
0x3b: 0x1fa0e, // onautocomplete
|
||||
0x40: 0x2eb02, // mi
|
||||
0x41: 0x31c08, // seamless
|
||||
0x42: 0x2807, // charset
|
||||
0x43: 0x8502, // id
|
||||
0x44: 0x5200a, // onpopstate
|
||||
0x45: 0x3ef03, // del
|
||||
0x46: 0x2cb07, // marquee
|
||||
0x47: 0x3309, // accesskey
|
||||
0x49: 0x8d06, // footer
|
||||
0x4a: 0x44e04, // list
|
||||
0x4b: 0x2b005, // ismap
|
||||
0x51: 0x33804, // menu
|
||||
0x52: 0x2f04, // body
|
||||
0x55: 0x9a08, // frameset
|
||||
0x56: 0x54a07, // onreset
|
||||
0x57: 0x12705, // blink
|
||||
0x58: 0xa105, // title
|
||||
0x59: 0x38807, // article
|
||||
0x5b: 0x22e02, // th
|
||||
0x5d: 0x13101, // q
|
||||
0x5e: 0x3cf04, // open
|
||||
0x5f: 0x2fa04, // area
|
||||
0x61: 0x44206, // onload
|
||||
0x62: 0xda04, // font
|
||||
0x63: 0xd604, // base
|
||||
0x64: 0x16207, // colspan
|
||||
0x65: 0x53707, // keytype
|
||||
0x66: 0x11e02, // dl
|
||||
0x68: 0x1b008, // fieldset
|
||||
0x6a: 0x2eb03, // min
|
||||
0x6b: 0x11503, // var
|
||||
0x6f: 0x2d506, // header
|
||||
0x70: 0x13f02, // rt
|
||||
0x71: 0x15008, // colgroup
|
||||
0x72: 0x23502, // mn
|
||||
0x74: 0x13a07, // onabort
|
||||
0x75: 0x3906, // keygen
|
||||
0x76: 0x4c209, // onoffline
|
||||
0x77: 0x21f09, // challenge
|
||||
0x78: 0x2b203, // map
|
||||
0x7a: 0x2e902, // h4
|
||||
0x7b: 0x3b607, // onerror
|
||||
0x7c: 0x2e109, // maxlength
|
||||
0x7d: 0x2f505, // mtext
|
||||
0x7e: 0xbb07, // sandbox
|
||||
0x7f: 0x58b06, // onsort
|
||||
0x80: 0x100a, // malignmark
|
||||
0x81: 0x45d04, // meta
|
||||
0x82: 0x7b05, // async
|
||||
0x83: 0x2a702, // h3
|
||||
0x84: 0x26702, // dd
|
||||
0x85: 0x27004, // href
|
||||
0x86: 0x6e0a, // mediagroup
|
||||
0x87: 0x19406, // coords
|
||||
0x88: 0x41107, // srclang
|
||||
0x89: 0x34d0a, // ondblclick
|
||||
0x8a: 0x4005, // value
|
||||
0x8c: 0xe908, // oncancel
|
||||
0x8e: 0x3230a, // spellcheck
|
||||
0x8f: 0x9a05, // frame
|
||||
0x91: 0x12403, // big
|
||||
0x94: 0x1f606, // action
|
||||
0x95: 0x6903, // dir
|
||||
0x97: 0x2fb08, // readonly
|
||||
0x99: 0x42d05, // table
|
||||
0x9a: 0x61607, // summary
|
||||
0x9b: 0x12103, // wbr
|
||||
0x9c: 0x30a, // radiogroup
|
||||
0x9d: 0x6c04, // name
|
||||
0x9f: 0x62306, // system
|
||||
0xa1: 0x15d05, // color
|
||||
0xa2: 0x7f06, // canvas
|
||||
0xa3: 0x25504, // html
|
||||
0xa5: 0x56f09, // onseeking
|
||||
0xac: 0x4f905, // shape
|
||||
0xad: 0x25f03, // rel
|
||||
0xae: 0x28510, // oncanplaythrough
|
||||
0xaf: 0x3760a, // ondragover
|
||||
0xb0: 0x62608, // template
|
||||
0xb1: 0x1d80d, // foreignObject
|
||||
0xb3: 0x9204, // rows
|
||||
0xb6: 0x44e07, // listing
|
||||
0xb7: 0x49c06, // output
|
||||
0xb9: 0x3310b, // contextmenu
|
||||
0xbb: 0x11f03, // low
|
||||
0xbc: 0x1c602, // rp
|
||||
0xbd: 0x5bb09, // onsuspend
|
||||
0xbe: 0x13606, // button
|
||||
0xbf: 0x4db04, // desc
|
||||
0xc1: 0x4e207, // section
|
||||
0xc2: 0x52a0a, // onprogress
|
||||
0xc3: 0x59e09, // onstorage
|
||||
0xc4: 0x2d204, // math
|
||||
0xc5: 0x4503, // alt
|
||||
0xc7: 0x8a02, // ul
|
||||
0xc8: 0x5107, // pattern
|
||||
0xc9: 0x4b60c, // onmousewheel
|
||||
0xca: 0x35709, // ondragend
|
||||
0xcb: 0xaf04, // ruby
|
||||
0xcc: 0xc01, // p
|
||||
0xcd: 0x31707, // onclose
|
||||
0xce: 0x24205, // meter
|
||||
0xcf: 0x11807, // bgsound
|
||||
0xd2: 0x25106, // height
|
||||
0xd4: 0x101, // b
|
||||
0xd5: 0x2c308, // itemtype
|
||||
0xd8: 0x1bb07, // caption
|
||||
0xd9: 0x10c08, // disabled
|
||||
0xdb: 0x33808, // menuitem
|
||||
0xdc: 0x62003, // svg
|
||||
0xdd: 0x18f05, // small
|
||||
0xde: 0x44a04, // data
|
||||
0xe0: 0x4cb08, // ononline
|
||||
0xe1: 0x2a206, // mglyph
|
||||
0xe3: 0x6505, // embed
|
||||
0xe4: 0x10502, // tr
|
||||
0xe5: 0x46b0b, // onloadstart
|
||||
0xe7: 0x3c306, // srcdoc
|
||||
0xeb: 0x5c408, // ontoggle
|
||||
0xed: 0xe703, // bdo
|
||||
0xee: 0x4702, // td
|
||||
0xef: 0x8305, // aside
|
||||
0xf0: 0x29402, // h2
|
||||
0xf1: 0x52c08, // progress
|
||||
0xf2: 0x12c0a, // blockquote
|
||||
0xf4: 0xf005, // label
|
||||
0xf5: 0x601, // i
|
||||
0xf7: 0x9207, // rowspan
|
||||
0xfb: 0x51709, // onplaying
|
||||
0xfd: 0x2a103, // img
|
||||
0xfe: 0xf608, // optgroup
|
||||
0xff: 0x42307, // content
|
||||
0x101: 0x53e0c, // onratechange
|
||||
0x103: 0x3da0c, // onhashchange
|
||||
0x104: 0x4807, // details
|
||||
0x106: 0x40008, // download
|
||||
0x109: 0x14009, // translate
|
||||
0x10b: 0x4230f, // contenteditable
|
||||
0x10d: 0x36b0b, // ondragleave
|
||||
0x10e: 0x2106, // accept
|
||||
0x10f: 0x57a08, // selected
|
||||
0x112: 0x1f20a, // formaction
|
||||
0x113: 0x5b506, // center
|
||||
0x115: 0x45510, // onloadedmetadata
|
||||
0x116: 0x12804, // link
|
||||
0x117: 0xdd04, // time
|
||||
0x118: 0x19f0b, // crossorigin
|
||||
0x119: 0x3bd07, // onfocus
|
||||
0x11a: 0x58704, // wrap
|
||||
0x11b: 0x42204, // icon
|
||||
0x11d: 0x28105, // video
|
||||
0x11e: 0x4de05, // class
|
||||
0x121: 0x5d40e, // onvolumechange
|
||||
0x122: 0xaa06, // onblur
|
||||
0x123: 0x2b909, // itemscope
|
||||
0x124: 0x61105, // style
|
||||
0x127: 0x41e06, // public
|
||||
0x129: 0x2320e, // formnovalidate
|
||||
0x12a: 0x58206, // onshow
|
||||
0x12c: 0x51706, // onplay
|
||||
0x12d: 0x3c804, // cite
|
||||
0x12e: 0x2bc02, // ms
|
||||
0x12f: 0xdb0c, // ontimeupdate
|
||||
0x130: 0x10904, // kind
|
||||
0x131: 0x2470a, // formtarget
|
||||
0x135: 0x3af07, // onended
|
||||
0x136: 0x26506, // hidden
|
||||
0x137: 0x2c01, // s
|
||||
0x139: 0x2280a, // formmethod
|
||||
0x13a: 0x3e805, // input
|
||||
0x13c: 0x50b02, // h6
|
||||
0x13d: 0xc902, // ol
|
||||
0x13e: 0x3420b, // oncuechange
|
||||
0x13f: 0x1e50d, // foreignobject
|
||||
0x143: 0x4e70e, // onbeforeunload
|
||||
0x144: 0x2bd05, // scope
|
||||
0x145: 0x39609, // onemptied
|
||||
0x146: 0x14b05, // defer
|
||||
0x147: 0xc103, // xmp
|
||||
0x148: 0x39f10, // ondurationchange
|
||||
0x149: 0x1903, // kbd
|
||||
0x14c: 0x47609, // onmessage
|
||||
0x14d: 0x60006, // option
|
||||
0x14e: 0x2eb09, // minlength
|
||||
0x14f: 0x32807, // checked
|
||||
0x150: 0xce08, // autoplay
|
||||
0x152: 0x202, // br
|
||||
0x153: 0x2360a, // novalidate
|
||||
0x156: 0x6307, // noembed
|
||||
0x159: 0x31007, // onclick
|
||||
0x15a: 0x47f0b, // onmousedown
|
||||
0x15b: 0x3a708, // onchange
|
||||
0x15e: 0x3f209, // oninvalid
|
||||
0x15f: 0x2bd06, // scoped
|
||||
0x160: 0x18808, // controls
|
||||
0x161: 0x30b05, // muted
|
||||
0x162: 0x58d08, // sortable
|
||||
0x163: 0x51106, // usemap
|
||||
0x164: 0x1b80a, // figcaption
|
||||
0x165: 0x35706, // ondrag
|
||||
0x166: 0x26b04, // high
|
||||
0x168: 0x3c303, // src
|
||||
0x169: 0x15706, // poster
|
||||
0x16b: 0x1670e, // annotation-xml
|
||||
0x16c: 0x5f704, // step
|
||||
0x16d: 0x4, // abbr
|
||||
0x16e: 0x1b06, // dialog
|
||||
0x170: 0x1202, // li
|
||||
0x172: 0x3ed02, // mo
|
||||
0x175: 0x1d803, // for
|
||||
0x176: 0x1a803, // ins
|
||||
0x178: 0x55504, // size
|
||||
0x179: 0x43210, // onlanguagechange
|
||||
0x17a: 0x8607, // default
|
||||
0x17b: 0x1a03, // bdi
|
||||
0x17c: 0x4d30a, // onpagehide
|
||||
0x17d: 0x6907, // dirname
|
||||
0x17e: 0x21404, // type
|
||||
0x17f: 0x1f204, // form
|
||||
0x181: 0x28509, // oncanplay
|
||||
0x182: 0x6103, // dfn
|
||||
0x183: 0x46308, // tabindex
|
||||
0x186: 0x6502, // em
|
||||
0x187: 0x27404, // lang
|
||||
0x189: 0x39108, // dropzone
|
||||
0x18a: 0x4080a, // onkeypress
|
||||
0x18b: 0x23c08, // datetime
|
||||
0x18c: 0x16204, // cols
|
||||
0x18d: 0x1, // a
|
||||
0x18e: 0x4420c, // onloadeddata
|
||||
0x190: 0xa605, // audio
|
||||
0x192: 0x2e05, // tbody
|
||||
0x193: 0x22c06, // method
|
||||
0x195: 0xf404, // loop
|
||||
0x196: 0x29606, // iframe
|
||||
0x198: 0x2d504, // head
|
||||
0x19e: 0x5f108, // manifest
|
||||
0x19f: 0xb309, // autofocus
|
||||
0x1a0: 0x14904, // code
|
||||
0x1a1: 0x55906, // strong
|
||||
0x1a2: 0x30308, // multiple
|
||||
0x1a3: 0xc05, // param
|
||||
0x1a6: 0x21107, // enctype
|
||||
0x1a7: 0x5b304, // face
|
||||
0x1a8: 0xfd09, // plaintext
|
||||
0x1a9: 0x26e02, // h1
|
||||
0x1aa: 0x59509, // onstalled
|
||||
0x1ad: 0x3d406, // script
|
||||
0x1ae: 0x2db06, // spacer
|
||||
0x1af: 0x55108, // onresize
|
||||
0x1b0: 0x4a20b, // onmouseover
|
||||
0x1b1: 0x5cc08, // onunload
|
||||
0x1b2: 0x56708, // onseeked
|
||||
0x1b4: 0x2140d, // typemustmatch
|
||||
0x1b5: 0x1cc06, // figure
|
||||
0x1b6: 0x4950a, // onmouseout
|
||||
0x1b7: 0x25e03, // pre
|
||||
0x1b8: 0x50705, // width
|
||||
0x1b9: 0x19906, // sorted
|
||||
0x1bb: 0x5704, // nobr
|
||||
0x1be: 0x5302, // tt
|
||||
0x1bf: 0x1105, // align
|
||||
0x1c0: 0x3e607, // oninput
|
||||
0x1c3: 0x41807, // onkeyup
|
||||
0x1c6: 0x1c00c, // onafterprint
|
||||
0x1c7: 0x210e, // accept-charset
|
||||
0x1c8: 0x33c06, // itemid
|
||||
0x1c9: 0x3e809, // inputmode
|
||||
0x1cb: 0x53306, // strike
|
||||
0x1cc: 0x5a903, // sub
|
||||
0x1cd: 0x10505, // track
|
||||
0x1ce: 0x38605, // start
|
||||
0x1d0: 0xd608, // basefont
|
||||
0x1d6: 0x1aa06, // source
|
||||
0x1d7: 0x18206, // legend
|
||||
0x1d8: 0x2d405, // thead
|
||||
0x1da: 0x8c05, // tfoot
|
||||
0x1dd: 0x1ec06, // object
|
||||
0x1de: 0x6e05, // media
|
||||
0x1df: 0x1670a, // annotation
|
||||
0x1e0: 0x20d0b, // formenctype
|
||||
0x1e2: 0x3d208, // noscript
|
||||
0x1e4: 0x55505, // sizes
|
||||
0x1e5: 0x1fc0c, // autocomplete
|
||||
0x1e6: 0x9504, // span
|
||||
0x1e7: 0x9808, // noframes
|
||||
0x1e8: 0x24b06, // target
|
||||
0x1e9: 0x38f06, // ondrop
|
||||
0x1ea: 0x2b306, // applet
|
||||
0x1ec: 0x5a08, // reversed
|
||||
0x1f0: 0x2a907, // isindex
|
||||
0x1f3: 0x27008, // hreflang
|
||||
0x1f5: 0x2f302, // h5
|
||||
0x1f6: 0x4f307, // address
|
||||
0x1fa: 0x2e103, // max
|
||||
0x1fb: 0xc30b, // placeholder
|
||||
0x1fc: 0x2f608, // textarea
|
||||
0x1fe: 0x4ad09, // onmouseup
|
||||
0x1ff: 0x3800b, // ondragstart
|
||||
}
|
||||
|
||||
const atomText = "abbradiogrouparamalignmarkbdialogaccept-charsetbodyaccesskey" +
|
||||
"genavaluealtdetailsampatternobreversedfnoembedirnamediagroup" +
|
||||
"ingasyncanvasidefaultfooterowspanoframesetitleaudionblurubya" +
|
||||
"utofocusandboxmplaceholderautoplaybasefontimeupdatebdoncance" +
|
||||
"labelooptgrouplaintextrackindisabledivarbgsoundlowbrbigblink" +
|
||||
"blockquotebuttonabortranslatecodefercolgroupostercolorcolspa" +
|
||||
"nnotation-xmlcommandraggablegendcontrolsmallcoordsortedcross" +
|
||||
"originsourcefieldsetfigcaptionafterprintfigurequiredforeignO" +
|
||||
"bjectforeignobjectformactionautocompleteerrorformenctypemust" +
|
||||
"matchallengeformmethodformnovalidatetimeterformtargetheightm" +
|
||||
"lhgroupreloadhiddenhigh1hreflanghttp-equivideoncanplaythroug" +
|
||||
"h2iframeimageimglyph3isindexismappletitemscopeditemtypemarqu" +
|
||||
"eematheaderspacermaxlength4minlength5mtextareadonlymultiplem" +
|
||||
"utedonclickoncloseamlesspellcheckedoncontextmenuitemidoncuec" +
|
||||
"hangeondblclickondragendondragenterondragleaveondragoverondr" +
|
||||
"agstarticleondropzonemptiedondurationchangeonendedonerroronf" +
|
||||
"ocusrcdocitempropenoscriptonhashchangeoninputmodeloninvalido" +
|
||||
"nkeydownloadonkeypressrclangonkeyupublicontenteditableonlang" +
|
||||
"uagechangeonloadeddatalistingonloadedmetadatabindexonloadsta" +
|
||||
"rtonmessageonmousedownonmousemoveonmouseoutputonmouseoveronm" +
|
||||
"ouseuponmousewheelonofflineononlineonpagehidesclassectionbef" +
|
||||
"oreunloaddresshapeonpageshowidth6onpausemaponplayingonpopsta" +
|
||||
"teonprogresstrikeytypeonratechangeonresetonresizestrongonscr" +
|
||||
"ollonseekedonseekingonselectedonshowraponsortableonstalledon" +
|
||||
"storageonsubmitemrefacenteronsuspendontoggleonunloadonvolume" +
|
||||
"changeonwaitingoptimumanifestepromptoptionbeforeprintstylesu" +
|
||||
"mmarysupsvgsystemplate"
|
||||
+257
@@ -0,0 +1,257 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package charset provides common text encodings for HTML documents.
|
||||
//
|
||||
// The mapping from encoding labels to encodings is defined at
|
||||
// https://encoding.spec.whatwg.org/.
|
||||
package charset // import "golang.org/x/net/html/charset"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
"golang.org/x/text/encoding/htmlindex"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// Lookup returns the encoding with the specified label, and its canonical
|
||||
// name. It returns nil and the empty string if label is not one of the
|
||||
// standard encodings for HTML. Matching is case-insensitive and ignores
|
||||
// leading and trailing whitespace. Encoders will use HTML escape sequences for
|
||||
// runes that are not supported by the character set.
|
||||
func Lookup(label string) (e encoding.Encoding, name string) {
|
||||
e, err := htmlindex.Get(label)
|
||||
if err != nil {
|
||||
return nil, ""
|
||||
}
|
||||
name, _ = htmlindex.Name(e)
|
||||
return &htmlEncoding{e}, name
|
||||
}
|
||||
|
||||
type htmlEncoding struct{ encoding.Encoding }
|
||||
|
||||
func (h *htmlEncoding) NewEncoder() *encoding.Encoder {
|
||||
// HTML requires a non-terminating legacy encoder. We use HTML escapes to
|
||||
// substitute unsupported code points.
|
||||
return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder())
|
||||
}
|
||||
|
||||
// DetermineEncoding determines the encoding of an HTML document by examining
|
||||
// up to the first 1024 bytes of content and the declared Content-Type.
|
||||
//
|
||||
// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
|
||||
func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
|
||||
if len(content) > 1024 {
|
||||
content = content[:1024]
|
||||
}
|
||||
|
||||
for _, b := range boms {
|
||||
if bytes.HasPrefix(content, b.bom) {
|
||||
e, name = Lookup(b.enc)
|
||||
return e, name, true
|
||||
}
|
||||
}
|
||||
|
||||
if _, params, err := mime.ParseMediaType(contentType); err == nil {
|
||||
if cs, ok := params["charset"]; ok {
|
||||
if e, name = Lookup(cs); e != nil {
|
||||
return e, name, true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(content) > 0 {
|
||||
e, name = prescan(content)
|
||||
if e != nil {
|
||||
return e, name, false
|
||||
}
|
||||
}
|
||||
|
||||
// Try to detect UTF-8.
|
||||
// First eliminate any partial rune at the end.
|
||||
for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
|
||||
b := content[i]
|
||||
if b < 0x80 {
|
||||
break
|
||||
}
|
||||
if utf8.RuneStart(b) {
|
||||
content = content[:i]
|
||||
break
|
||||
}
|
||||
}
|
||||
hasHighBit := false
|
||||
for _, c := range content {
|
||||
if c >= 0x80 {
|
||||
hasHighBit = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if hasHighBit && utf8.Valid(content) {
|
||||
return encoding.Nop, "utf-8", false
|
||||
}
|
||||
|
||||
// TODO: change default depending on user's locale?
|
||||
return charmap.Windows1252, "windows-1252", false
|
||||
}
|
||||
|
||||
// NewReader returns an io.Reader that converts the content of r to UTF-8.
|
||||
// It calls DetermineEncoding to find out what r's encoding is.
|
||||
func NewReader(r io.Reader, contentType string) (io.Reader, error) {
|
||||
preview := make([]byte, 1024)
|
||||
n, err := io.ReadFull(r, preview)
|
||||
switch {
|
||||
case err == io.ErrUnexpectedEOF:
|
||||
preview = preview[:n]
|
||||
r = bytes.NewReader(preview)
|
||||
case err != nil:
|
||||
return nil, err
|
||||
default:
|
||||
r = io.MultiReader(bytes.NewReader(preview), r)
|
||||
}
|
||||
|
||||
if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {
|
||||
r = transform.NewReader(r, e.NewDecoder())
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// NewReaderLabel returns a reader that converts from the specified charset to
|
||||
// UTF-8. It uses Lookup to find the encoding that corresponds to label, and
|
||||
// returns an error if Lookup returns nil. It is suitable for use as
|
||||
// encoding/xml.Decoder's CharsetReader function.
|
||||
func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
|
||||
e, _ := Lookup(label)
|
||||
if e == nil {
|
||||
return nil, fmt.Errorf("unsupported charset: %q", label)
|
||||
}
|
||||
return transform.NewReader(input, e.NewDecoder()), nil
|
||||
}
|
||||
|
||||
func prescan(content []byte) (e encoding.Encoding, name string) {
|
||||
z := html.NewTokenizer(bytes.NewReader(content))
|
||||
for {
|
||||
switch z.Next() {
|
||||
case html.ErrorToken:
|
||||
return nil, ""
|
||||
|
||||
case html.StartTagToken, html.SelfClosingTagToken:
|
||||
tagName, hasAttr := z.TagName()
|
||||
if !bytes.Equal(tagName, []byte("meta")) {
|
||||
continue
|
||||
}
|
||||
attrList := make(map[string]bool)
|
||||
gotPragma := false
|
||||
|
||||
const (
|
||||
dontKnow = iota
|
||||
doNeedPragma
|
||||
doNotNeedPragma
|
||||
)
|
||||
needPragma := dontKnow
|
||||
|
||||
name = ""
|
||||
e = nil
|
||||
for hasAttr {
|
||||
var key, val []byte
|
||||
key, val, hasAttr = z.TagAttr()
|
||||
ks := string(key)
|
||||
if attrList[ks] {
|
||||
continue
|
||||
}
|
||||
attrList[ks] = true
|
||||
for i, c := range val {
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
val[i] = c + 0x20
|
||||
}
|
||||
}
|
||||
|
||||
switch ks {
|
||||
case "http-equiv":
|
||||
if bytes.Equal(val, []byte("content-type")) {
|
||||
gotPragma = true
|
||||
}
|
||||
|
||||
case "content":
|
||||
if e == nil {
|
||||
name = fromMetaElement(string(val))
|
||||
if name != "" {
|
||||
e, name = Lookup(name)
|
||||
if e != nil {
|
||||
needPragma = doNeedPragma
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
case "charset":
|
||||
e, name = Lookup(string(val))
|
||||
needPragma = doNotNeedPragma
|
||||
}
|
||||
}
|
||||
|
||||
if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasPrefix(name, "utf-16") {
|
||||
name = "utf-8"
|
||||
e = encoding.Nop
|
||||
}
|
||||
|
||||
if e != nil {
|
||||
return e, name
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func fromMetaElement(s string) string {
|
||||
for s != "" {
|
||||
csLoc := strings.Index(s, "charset")
|
||||
if csLoc == -1 {
|
||||
return ""
|
||||
}
|
||||
s = s[csLoc+len("charset"):]
|
||||
s = strings.TrimLeft(s, " \t\n\f\r")
|
||||
if !strings.HasPrefix(s, "=") {
|
||||
continue
|
||||
}
|
||||
s = s[1:]
|
||||
s = strings.TrimLeft(s, " \t\n\f\r")
|
||||
if s == "" {
|
||||
return ""
|
||||
}
|
||||
if q := s[0]; q == '"' || q == '\'' {
|
||||
s = s[1:]
|
||||
closeQuote := strings.IndexRune(s, rune(q))
|
||||
if closeQuote == -1 {
|
||||
return ""
|
||||
}
|
||||
return s[:closeQuote]
|
||||
}
|
||||
|
||||
end := strings.IndexAny(s, "; \t\n\f\r")
|
||||
if end == -1 {
|
||||
end = len(s)
|
||||
}
|
||||
return s[:end]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
var boms = []struct {
|
||||
bom []byte
|
||||
enc string
|
||||
}{
|
||||
{[]byte{0xfe, 0xff}, "utf-16be"},
|
||||
{[]byte{0xff, 0xfe}, "utf-16le"},
|
||||
{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
|
||||
}
|
||||
+102
@@ -0,0 +1,102 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package html
|
||||
|
||||
// Section 12.2.3.2 of the HTML5 specification says "The following elements
|
||||
// have varying levels of special parsing rules".
|
||||
// https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
|
||||
var isSpecialElementMap = map[string]bool{
|
||||
"address": true,
|
||||
"applet": true,
|
||||
"area": true,
|
||||
"article": true,
|
||||
"aside": true,
|
||||
"base": true,
|
||||
"basefont": true,
|
||||
"bgsound": true,
|
||||
"blockquote": true,
|
||||
"body": true,
|
||||
"br": true,
|
||||
"button": true,
|
||||
"caption": true,
|
||||
"center": true,
|
||||
"col": true,
|
||||
"colgroup": true,
|
||||
"dd": true,
|
||||
"details": true,
|
||||
"dir": true,
|
||||
"div": true,
|
||||
"dl": true,
|
||||
"dt": true,
|
||||
"embed": true,
|
||||
"fieldset": true,
|
||||
"figcaption": true,
|
||||
"figure": true,
|
||||
"footer": true,
|
||||
"form": true,
|
||||
"frame": true,
|
||||
"frameset": true,
|
||||
"h1": true,
|
||||
"h2": true,
|
||||
"h3": true,
|
||||
"h4": true,
|
||||
"h5": true,
|
||||
"h6": true,
|
||||
"head": true,
|
||||
"header": true,
|
||||
"hgroup": true,
|
||||
"hr": true,
|
||||
"html": true,
|
||||
"iframe": true,
|
||||
"img": true,
|
||||
"input": true,
|
||||
"isindex": true,
|
||||
"li": true,
|
||||
"link": true,
|
||||
"listing": true,
|
||||
"marquee": true,
|
||||
"menu": true,
|
||||
"meta": true,
|
||||
"nav": true,
|
||||
"noembed": true,
|
||||
"noframes": true,
|
||||
"noscript": true,
|
||||
"object": true,
|
||||
"ol": true,
|
||||
"p": true,
|
||||
"param": true,
|
||||
"plaintext": true,
|
||||
"pre": true,
|
||||
"script": true,
|
||||
"section": true,
|
||||
"select": true,
|
||||
"source": true,
|
||||
"style": true,
|
||||
"summary": true,
|
||||
"table": true,
|
||||
"tbody": true,
|
||||
"td": true,
|
||||
"template": true,
|
||||
"textarea": true,
|
||||
"tfoot": true,
|
||||
"th": true,
|
||||
"thead": true,
|
||||
"title": true,
|
||||
"tr": true,
|
||||
"track": true,
|
||||
"ul": true,
|
||||
"wbr": true,
|
||||
"xmp": true,
|
||||
}
|
||||
|
||||
func isSpecialElement(element *Node) bool {
|
||||
switch element.Namespace {
|
||||
case "", "html":
|
||||
return isSpecialElementMap[element.Data]
|
||||
case "svg":
|
||||
return element.Data == "foreignObject"
|
||||
}
|
||||
return false
|
||||
}
|
||||
+106
@@ -0,0 +1,106 @@
|
||||
// Copyright 2010 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
/*
|
||||
Package html implements an HTML5-compliant tokenizer and parser.
|
||||
|
||||
Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
|
||||
caller's responsibility to ensure that r provides UTF-8 encoded HTML.
|
||||
|
||||
z := html.NewTokenizer(r)
|
||||
|
||||
Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(),
|
||||
which parses the next token and returns its type, or an error:
|
||||
|
||||
for {
|
||||
tt := z.Next()
|
||||
if tt == html.ErrorToken {
|
||||
// ...
|
||||
return ...
|
||||
}
|
||||
// Process the current token.
|
||||
}
|
||||
|
||||
There are two APIs for retrieving the current token. The high-level API is to
|
||||
call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs
|
||||
allow optionally calling Raw after Next but before Token, Text, TagName, or
|
||||
TagAttr. In EBNF notation, the valid call sequence per token is:
|
||||
|
||||
Next {Raw} [ Token | Text | TagName {TagAttr} ]
|
||||
|
||||
Token returns an independent data structure that completely describes a token.
|
||||
Entities (such as "<") are unescaped, tag names and attribute keys are
|
||||
lower-cased, and attributes are collected into a []Attribute. For example:
|
||||
|
||||
for {
|
||||
if z.Next() == html.ErrorToken {
|
||||
// Returning io.EOF indicates success.
|
||||
return z.Err()
|
||||
}
|
||||
emitToken(z.Token())
|
||||
}
|
||||
|
||||
The low-level API performs fewer allocations and copies, but the contents of
|
||||
the []byte values returned by Text, TagName and TagAttr may change on the next
|
||||
call to Next. For example, to extract an HTML page's anchor text:
|
||||
|
||||
depth := 0
|
||||
for {
|
||||
tt := z.Next()
|
||||
switch tt {
|
||||
case ErrorToken:
|
||||
return z.Err()
|
||||
case TextToken:
|
||||
if depth > 0 {
|
||||
// emitBytes should copy the []byte it receives,
|
||||
// if it doesn't process it immediately.
|
||||
emitBytes(z.Text())
|
||||
}
|
||||
case StartTagToken, EndTagToken:
|
||||
tn, _ := z.TagName()
|
||||
if len(tn) == 1 && tn[0] == 'a' {
|
||||
if tt == StartTagToken {
|
||||
depth++
|
||||
} else {
|
||||
depth--
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Parsing is done by calling Parse with an io.Reader, which returns the root of
|
||||
the parse tree (the document element) as a *Node. It is the caller's
|
||||
responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
|
||||
example, to process each anchor node in depth-first order:
|
||||
|
||||
doc, err := html.Parse(r)
|
||||
if err != nil {
|
||||
// ...
|
||||
}
|
||||
var f func(*html.Node)
|
||||
f = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode && n.Data == "a" {
|
||||
// Do something with n...
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
f(c)
|
||||
}
|
||||
}
|
||||
f(doc)
|
||||
|
||||
The relevant specifications include:
|
||||
https://html.spec.whatwg.org/multipage/syntax.html and
|
||||
https://html.spec.whatwg.org/multipage/syntax.html#tokenization
|
||||
*/
|
||||
package html // import "golang.org/x/net/html"
|
||||
|
||||
// The tokenization algorithm implemented by this package is not a line-by-line
|
||||
// transliteration of the relatively verbose state-machine in the WHATWG
|
||||
// specification. A more direct approach is used instead, where the program
|
||||
// counter implies the state, such as whether it is tokenizing a tag or a text
|
||||
// node. Specification compliance is verified by checking expected and actual
|
||||
// outputs over a test suite rather than aiming for algorithmic fidelity.
|
||||
|
||||
// TODO(nigeltao): Does a DOM API belong in this package or a separate one?
|
||||
// TODO(nigeltao): How does parsing interact with a JavaScript engine?
|
||||
+156
@@ -0,0 +1,156 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package html
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// parseDoctype parses the data from a DoctypeToken into a name,
|
||||
// public identifier, and system identifier. It returns a Node whose Type
|
||||
// is DoctypeNode, whose Data is the name, and which has attributes
|
||||
// named "system" and "public" for the two identifiers if they were present.
|
||||
// quirks is whether the document should be parsed in "quirks mode".
|
||||
func parseDoctype(s string) (n *Node, quirks bool) {
|
||||
n = &Node{Type: DoctypeNode}
|
||||
|
||||
// Find the name.
|
||||
space := strings.IndexAny(s, whitespace)
|
||||
if space == -1 {
|
||||
space = len(s)
|
||||
}
|
||||
n.Data = s[:space]
|
||||
// The comparison to "html" is case-sensitive.
|
||||
if n.Data != "html" {
|
||||
quirks = true
|
||||
}
|
||||
n.Data = strings.ToLower(n.Data)
|
||||
s = strings.TrimLeft(s[space:], whitespace)
|
||||
|
||||
if len(s) < 6 {
|
||||
// It can't start with "PUBLIC" or "SYSTEM".
|
||||
// Ignore the rest of the string.
|
||||
return n, quirks || s != ""
|
||||
}
|
||||
|
||||
key := strings.ToLower(s[:6])
|
||||
s = s[6:]
|
||||
for key == "public" || key == "system" {
|
||||
s = strings.TrimLeft(s, whitespace)
|
||||
if s == "" {
|
||||
break
|
||||
}
|
||||
quote := s[0]
|
||||
if quote != '"' && quote != '\'' {
|
||||
break
|
||||
}
|
||||
s = s[1:]
|
||||
q := strings.IndexRune(s, rune(quote))
|
||||
var id string
|
||||
if q == -1 {
|
||||
id = s
|
||||
s = ""
|
||||
} else {
|
||||
id = s[:q]
|
||||
s = s[q+1:]
|
||||
}
|
||||
n.Attr = append(n.Attr, Attribute{Key: key, Val: id})
|
||||
if key == "public" {
|
||||
key = "system"
|
||||
} else {
|
||||
key = ""
|
||||
}
|
||||
}
|
||||
|
||||
if key != "" || s != "" {
|
||||
quirks = true
|
||||
} else if len(n.Attr) > 0 {
|
||||
if n.Attr[0].Key == "public" {
|
||||
public := strings.ToLower(n.Attr[0].Val)
|
||||
switch public {
|
||||
case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html":
|
||||
quirks = true
|
||||
default:
|
||||
for _, q := range quirkyIDs {
|
||||
if strings.HasPrefix(public, q) {
|
||||
quirks = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
// The following two public IDs only cause quirks mode if there is no system ID.
|
||||
if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") ||
|
||||
strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) {
|
||||
quirks = true
|
||||
}
|
||||
}
|
||||
if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" &&
|
||||
strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" {
|
||||
quirks = true
|
||||
}
|
||||
}
|
||||
|
||||
return n, quirks
|
||||
}
|
||||
|
||||
// quirkyIDs is a list of public doctype identifiers that cause a document
|
||||
// to be interpreted in quirks mode. The identifiers should be in lower case.
|
||||
var quirkyIDs = []string{
|
||||
"+//silmaril//dtd html pro v0r11 19970101//",
|
||||
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
|
||||
"-//as//dtd html 3.0 aswedit + extensions//",
|
||||
"-//ietf//dtd html 2.0 level 1//",
|
||||
"-//ietf//dtd html 2.0 level 2//",
|
||||
"-//ietf//dtd html 2.0 strict level 1//",
|
||||
"-//ietf//dtd html 2.0 strict level 2//",
|
||||
"-//ietf//dtd html 2.0 strict//",
|
||||
"-//ietf//dtd html 2.0//",
|
||||
"-//ietf//dtd html 2.1e//",
|
||||
"-//ietf//dtd html 3.0//",
|
||||
"-//ietf//dtd html 3.2 final//",
|
||||
"-//ietf//dtd html 3.2//",
|
||||
"-//ietf//dtd html 3//",
|
||||
"-//ietf//dtd html level 0//",
|
||||
"-//ietf//dtd html level 1//",
|
||||
"-//ietf//dtd html level 2//",
|
||||
"-//ietf//dtd html level 3//",
|
||||
"-//ietf//dtd html strict level 0//",
|
||||
"-//ietf//dtd html strict level 1//",
|
||||
"-//ietf//dtd html strict level 2//",
|
||||
"-//ietf//dtd html strict level 3//",
|
||||
"-//ietf//dtd html strict//",
|
||||
"-//ietf//dtd html//",
|
||||
"-//metrius//dtd metrius presentational//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html//",
|
||||
"-//microsoft//dtd internet explorer 2.0 tables//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html//",
|
||||
"-//microsoft//dtd internet explorer 3.0 tables//",
|
||||
"-//netscape comm. corp.//dtd html//",
|
||||
"-//netscape comm. corp.//dtd strict html//",
|
||||
"-//o'reilly and associates//dtd html 2.0//",
|
||||
"-//o'reilly and associates//dtd html extended 1.0//",
|
||||
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
|
||||
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
|
||||
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
|
||||
"-//spyglass//dtd html 2.0 extended//",
|
||||
"-//sq//dtd html 2.0 hotmetal + extensions//",
|
||||
"-//sun microsystems corp.//dtd hotjava html//",
|
||||
"-//sun microsystems corp.//dtd hotjava strict html//",
|
||||
"-//w3c//dtd html 3 1995-03-24//",
|
||||
"-//w3c//dtd html 3.2 draft//",
|
||||
"-//w3c//dtd html 3.2 final//",
|
||||
"-//w3c//dtd html 3.2//",
|
||||
"-//w3c//dtd html 3.2s draft//",
|
||||
"-//w3c//dtd html 4.0 frameset//",
|
||||
"-//w3c//dtd html 4.0 transitional//",
|
||||
"-//w3c//dtd html experimental 19960712//",
|
||||
"-//w3c//dtd html experimental 970421//",
|
||||
"-//w3c//dtd w3 html//",
|
||||
"-//w3o//dtd w3 html 3.0//",
|
||||
"-//webtechs//dtd mozilla html 2.0//",
|
||||
"-//webtechs//dtd mozilla html//",
|
||||
}
|
||||
+2253
File diff suppressed because it is too large
Load Diff
+258
@@ -0,0 +1,258 @@
|
||||
// Copyright 2010 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package html
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// These replacements permit compatibility with old numeric entities that
|
||||
// assumed Windows-1252 encoding.
|
||||
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
|
||||
var replacementTable = [...]rune{
|
||||
'\u20AC', // First entry is what 0x80 should be replaced with.
|
||||
'\u0081',
|
||||
'\u201A',
|
||||
'\u0192',
|
||||
'\u201E',
|
||||
'\u2026',
|
||||
'\u2020',
|
||||
'\u2021',
|
||||
'\u02C6',
|
||||
'\u2030',
|
||||
'\u0160',
|
||||
'\u2039',
|
||||
'\u0152',
|
||||
'\u008D',
|
||||
'\u017D',
|
||||
'\u008F',
|
||||
'\u0090',
|
||||
'\u2018',
|
||||
'\u2019',
|
||||
'\u201C',
|
||||
'\u201D',
|
||||
'\u2022',
|
||||
'\u2013',
|
||||
'\u2014',
|
||||
'\u02DC',
|
||||
'\u2122',
|
||||
'\u0161',
|
||||
'\u203A',
|
||||
'\u0153',
|
||||
'\u009D',
|
||||
'\u017E',
|
||||
'\u0178', // Last entry is 0x9F.
|
||||
// 0x00->'\uFFFD' is handled programmatically.
|
||||
// 0x0D->'\u000D' is a no-op.
|
||||
}
|
||||
|
||||
// unescapeEntity reads an entity like "<" from b[src:] and writes the
|
||||
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
||||
// Precondition: b[src] == '&' && dst <= src.
|
||||
// attribute should be true if parsing an attribute value.
|
||||
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
|
||||
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
|
||||
|
||||
// i starts at 1 because we already know that s[0] == '&'.
|
||||
i, s := 1, b[src:]
|
||||
|
||||
if len(s) <= 1 {
|
||||
b[dst] = b[src]
|
||||
return dst + 1, src + 1
|
||||
}
|
||||
|
||||
if s[i] == '#' {
|
||||
if len(s) <= 3 { // We need to have at least "&#.".
|
||||
b[dst] = b[src]
|
||||
return dst + 1, src + 1
|
||||
}
|
||||
i++
|
||||
c := s[i]
|
||||
hex := false
|
||||
if c == 'x' || c == 'X' {
|
||||
hex = true
|
||||
i++
|
||||
}
|
||||
|
||||
x := '\x00'
|
||||
for i < len(s) {
|
||||
c = s[i]
|
||||
i++
|
||||
if hex {
|
||||
if '0' <= c && c <= '9' {
|
||||
x = 16*x + rune(c) - '0'
|
||||
continue
|
||||
} else if 'a' <= c && c <= 'f' {
|
||||
x = 16*x + rune(c) - 'a' + 10
|
||||
continue
|
||||
} else if 'A' <= c && c <= 'F' {
|
||||
x = 16*x + rune(c) - 'A' + 10
|
||||
continue
|
||||
}
|
||||
} else if '0' <= c && c <= '9' {
|
||||
x = 10*x + rune(c) - '0'
|
||||
continue
|
||||
}
|
||||
if c != ';' {
|
||||
i--
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
if i <= 3 { // No characters matched.
|
||||
b[dst] = b[src]
|
||||
return dst + 1, src + 1
|
||||
}
|
||||
|
||||
if 0x80 <= x && x <= 0x9F {
|
||||
// Replace characters from Windows-1252 with UTF-8 equivalents.
|
||||
x = replacementTable[x-0x80]
|
||||
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
|
||||
// Replace invalid characters with the replacement character.
|
||||
x = '\uFFFD'
|
||||
}
|
||||
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||
}
|
||||
|
||||
// Consume the maximum number of characters possible, with the
|
||||
// consumed characters matching one of the named references.
|
||||
|
||||
for i < len(s) {
|
||||
c := s[i]
|
||||
i++
|
||||
// Lower-cased characters are more common in entities, so we check for them first.
|
||||
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
|
||||
continue
|
||||
}
|
||||
if c != ';' {
|
||||
i--
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
entityName := string(s[1:i])
|
||||
if entityName == "" {
|
||||
// No-op.
|
||||
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
|
||||
// No-op.
|
||||
} else if x := entity[entityName]; x != 0 {
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||
} else if x := entity2[entityName]; x[0] != 0 {
|
||||
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
||||
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
||||
} else if !attribute {
|
||||
maxLen := len(entityName) - 1
|
||||
if maxLen > longestEntityWithoutSemicolon {
|
||||
maxLen = longestEntityWithoutSemicolon
|
||||
}
|
||||
for j := maxLen; j > 1; j-- {
|
||||
if x := entity[entityName[:j]]; x != 0 {
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dst1, src1 = dst+i, src+i
|
||||
copy(b[dst:dst1], b[src:src1])
|
||||
return dst1, src1
|
||||
}
|
||||
|
||||
// unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".
|
||||
// attribute should be true if parsing an attribute value.
|
||||
func unescape(b []byte, attribute bool) []byte {
|
||||
for i, c := range b {
|
||||
if c == '&' {
|
||||
dst, src := unescapeEntity(b, i, i, attribute)
|
||||
for src < len(b) {
|
||||
c := b[src]
|
||||
if c == '&' {
|
||||
dst, src = unescapeEntity(b, dst, src, attribute)
|
||||
} else {
|
||||
b[dst] = c
|
||||
dst, src = dst+1, src+1
|
||||
}
|
||||
}
|
||||
return b[0:dst]
|
||||
}
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
|
||||
func lower(b []byte) []byte {
|
||||
for i, c := range b {
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
b[i] = c + 'a' - 'A'
|
||||
}
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
const escapedChars = "&'<>\"\r"
|
||||
|
||||
func escape(w writer, s string) error {
|
||||
i := strings.IndexAny(s, escapedChars)
|
||||
for i != -1 {
|
||||
if _, err := w.WriteString(s[:i]); err != nil {
|
||||
return err
|
||||
}
|
||||
var esc string
|
||||
switch s[i] {
|
||||
case '&':
|
||||
esc = "&"
|
||||
case '\'':
|
||||
// "'" is shorter than "'" and apos was not in HTML until HTML5.
|
||||
esc = "'"
|
||||
case '<':
|
||||
esc = "<"
|
||||
case '>':
|
||||
esc = ">"
|
||||
case '"':
|
||||
// """ is shorter than """.
|
||||
esc = """
|
||||
case '\r':
|
||||
esc = " "
|
||||
default:
|
||||
panic("unrecognized escape character")
|
||||
}
|
||||
s = s[i+1:]
|
||||
if _, err := w.WriteString(esc); err != nil {
|
||||
return err
|
||||
}
|
||||
i = strings.IndexAny(s, escapedChars)
|
||||
}
|
||||
_, err := w.WriteString(s)
|
||||
return err
|
||||
}
|
||||
|
||||
// EscapeString escapes special characters like "<" to become "<". It
|
||||
// escapes only five such characters: <, >, &, ' and ".
|
||||
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
||||
// always true.
|
||||
func EscapeString(s string) string {
|
||||
if strings.IndexAny(s, escapedChars) == -1 {
|
||||
return s
|
||||
}
|
||||
var buf bytes.Buffer
|
||||
escape(&buf, s)
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
// UnescapeString unescapes entities like "<" to become "<". It unescapes a
|
||||
// larger range of entities than EscapeString escapes. For example, "á"
|
||||
// unescapes to "á", as does "á" and "&xE1;".
|
||||
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
||||
// always true.
|
||||
func UnescapeString(s string) string {
|
||||
for _, c := range s {
|
||||
if c == '&' {
|
||||
return string(unescape([]byte(s), false))
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
+226
@@ -0,0 +1,226 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package html
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func adjustAttributeNames(aa []Attribute, nameMap map[string]string) {
|
||||
for i := range aa {
|
||||
if newName, ok := nameMap[aa[i].Key]; ok {
|
||||
aa[i].Key = newName
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func adjustForeignAttributes(aa []Attribute) {
|
||||
for i, a := range aa {
|
||||
if a.Key == "" || a.Key[0] != 'x' {
|
||||
continue
|
||||
}
|
||||
switch a.Key {
|
||||
case "xlink:actuate", "xlink:arcrole", "xlink:href", "xlink:role", "xlink:show",
|
||||
"xlink:title", "xlink:type", "xml:base", "xml:lang", "xml:space", "xmlns:xlink":
|
||||
j := strings.Index(a.Key, ":")
|
||||
aa[i].Namespace = a.Key[:j]
|
||||
aa[i].Key = a.Key[j+1:]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func htmlIntegrationPoint(n *Node) bool {
|
||||
if n.Type != ElementNode {
|
||||
return false
|
||||
}
|
||||
switch n.Namespace {
|
||||
case "math":
|
||||
if n.Data == "annotation-xml" {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == "encoding" {
|
||||
val := strings.ToLower(a.Val)
|
||||
if val == "text/html" || val == "application/xhtml+xml" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
case "svg":
|
||||
switch n.Data {
|
||||
case "desc", "foreignObject", "title":
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func mathMLTextIntegrationPoint(n *Node) bool {
|
||||
if n.Namespace != "math" {
|
||||
return false
|
||||
}
|
||||
switch n.Data {
|
||||
case "mi", "mo", "mn", "ms", "mtext":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Section 12.2.5.5.
|
||||
var breakout = map[string]bool{
|
||||
"b": true,
|
||||
"big": true,
|
||||
"blockquote": true,
|
||||
"body": true,
|
||||
"br": true,
|
||||
"center": true,
|
||||
"code": true,
|
||||
"dd": true,
|
||||
"div": true,
|
||||
"dl": true,
|
||||
"dt": true,
|
||||
"em": true,
|
||||
"embed": true,
|
||||
"h1": true,
|
||||
"h2": true,
|
||||
"h3": true,
|
||||
"h4": true,
|
||||
"h5": true,
|
||||
"h6": true,
|
||||
"head": true,
|
||||
"hr": true,
|
||||
"i": true,
|
||||
"img": true,
|
||||
"li": true,
|
||||
"listing": true,
|
||||
"menu": true,
|
||||
"meta": true,
|
||||
"nobr": true,
|
||||
"ol": true,
|
||||
"p": true,
|
||||
"pre": true,
|
||||
"ruby": true,
|
||||
"s": true,
|
||||
"small": true,
|
||||
"span": true,
|
||||
"strong": true,
|
||||
"strike": true,
|
||||
"sub": true,
|
||||
"sup": true,
|
||||
"table": true,
|
||||
"tt": true,
|
||||
"u": true,
|
||||
"ul": true,
|
||||
"var": true,
|
||||
}
|
||||
|
||||
// Section 12.2.5.5.
|
||||
var svgTagNameAdjustments = map[string]string{
|
||||
"altglyph": "altGlyph",
|
||||
"altglyphdef": "altGlyphDef",
|
||||
"altglyphitem": "altGlyphItem",
|
||||
"animatecolor": "animateColor",
|
||||
"animatemotion": "animateMotion",
|
||||
"animatetransform": "animateTransform",
|
||||
"clippath": "clipPath",
|
||||
"feblend": "feBlend",
|
||||
"fecolormatrix": "feColorMatrix",
|
||||
"fecomponenttransfer": "feComponentTransfer",
|
||||
"fecomposite": "feComposite",
|
||||
"feconvolvematrix": "feConvolveMatrix",
|
||||
"fediffuselighting": "feDiffuseLighting",
|
||||
"fedisplacementmap": "feDisplacementMap",
|
||||
"fedistantlight": "feDistantLight",
|
||||
"feflood": "feFlood",
|
||||
"fefunca": "feFuncA",
|
||||
"fefuncb": "feFuncB",
|
||||
"fefuncg": "feFuncG",
|
||||
"fefuncr": "feFuncR",
|
||||
"fegaussianblur": "feGaussianBlur",
|
||||
"feimage": "feImage",
|
||||
"femerge": "feMerge",
|
||||
"femergenode": "feMergeNode",
|
||||
"femorphology": "feMorphology",
|
||||
"feoffset": "feOffset",
|
||||
"fepointlight": "fePointLight",
|
||||
"fespecularlighting": "feSpecularLighting",
|
||||
"fespotlight": "feSpotLight",
|
||||
"fetile": "feTile",
|
||||
"feturbulence": "feTurbulence",
|
||||
"foreignobject": "foreignObject",
|
||||
"glyphref": "glyphRef",
|
||||
"lineargradient": "linearGradient",
|
||||
"radialgradient": "radialGradient",
|
||||
"textpath": "textPath",
|
||||
}
|
||||
|
||||
// Section 12.2.5.1
|
||||
var mathMLAttributeAdjustments = map[string]string{
|
||||
"definitionurl": "definitionURL",
|
||||
}
|
||||
|
||||
var svgAttributeAdjustments = map[string]string{
|
||||
"attributename": "attributeName",
|
||||
"attributetype": "attributeType",
|
||||
"basefrequency": "baseFrequency",
|
||||
"baseprofile": "baseProfile",
|
||||
"calcmode": "calcMode",
|
||||
"clippathunits": "clipPathUnits",
|
||||
"contentscripttype": "contentScriptType",
|
||||
"contentstyletype": "contentStyleType",
|
||||
"diffuseconstant": "diffuseConstant",
|
||||
"edgemode": "edgeMode",
|
||||
"externalresourcesrequired": "externalResourcesRequired",
|
||||
"filterres": "filterRes",
|
||||
"filterunits": "filterUnits",
|
||||
"glyphref": "glyphRef",
|
||||
"gradienttransform": "gradientTransform",
|
||||
"gradientunits": "gradientUnits",
|
||||
"kernelmatrix": "kernelMatrix",
|
||||
"kernelunitlength": "kernelUnitLength",
|
||||
"keypoints": "keyPoints",
|
||||
"keysplines": "keySplines",
|
||||
"keytimes": "keyTimes",
|
||||
"lengthadjust": "lengthAdjust",
|
||||
"limitingconeangle": "limitingConeAngle",
|
||||
"markerheight": "markerHeight",
|
||||
"markerunits": "markerUnits",
|
||||
"markerwidth": "markerWidth",
|
||||
"maskcontentunits": "maskContentUnits",
|
||||
"maskunits": "maskUnits",
|
||||
"numoctaves": "numOctaves",
|
||||
"pathlength": "pathLength",
|
||||
"patterncontentunits": "patternContentUnits",
|
||||
"patterntransform": "patternTransform",
|
||||
"patternunits": "patternUnits",
|
||||
"pointsatx": "pointsAtX",
|
||||
"pointsaty": "pointsAtY",
|
||||
"pointsatz": "pointsAtZ",
|
||||
"preservealpha": "preserveAlpha",
|
||||
"preserveaspectratio": "preserveAspectRatio",
|
||||
"primitiveunits": "primitiveUnits",
|
||||
"refx": "refX",
|
||||
"refy": "refY",
|
||||
"repeatcount": "repeatCount",
|
||||
"repeatdur": "repeatDur",
|
||||
"requiredextensions": "requiredExtensions",
|
||||
"requiredfeatures": "requiredFeatures",
|
||||
"specularconstant": "specularConstant",
|
||||
"specularexponent": "specularExponent",
|
||||
"spreadmethod": "spreadMethod",
|
||||
"startoffset": "startOffset",
|
||||
"stddeviation": "stdDeviation",
|
||||
"stitchtiles": "stitchTiles",
|
||||
"surfacescale": "surfaceScale",
|
||||
"systemlanguage": "systemLanguage",
|
||||
"tablevalues": "tableValues",
|
||||
"targetx": "targetX",
|
||||
"targety": "targetY",
|
||||
"textlength": "textLength",
|
||||
"viewbox": "viewBox",
|
||||
"viewtarget": "viewTarget",
|
||||
"xchannelselector": "xChannelSelector",
|
||||
"ychannelselector": "yChannelSelector",
|
||||
"zoomandpan": "zoomAndPan",
|
||||
}
|
||||
+193
@@ -0,0 +1,193 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package html
|
||||
|
||||
import (
|
||||
"golang.org/x/net/html/atom"
|
||||
)
|
||||
|
||||
// A NodeType is the type of a Node.
|
||||
type NodeType uint32
|
||||
|
||||
const (
|
||||
ErrorNode NodeType = iota
|
||||
TextNode
|
||||
DocumentNode
|
||||
ElementNode
|
||||
CommentNode
|
||||
DoctypeNode
|
||||
scopeMarkerNode
|
||||
)
|
||||
|
||||
// Section 12.2.3.3 says "scope markers are inserted when entering applet
|
||||
// elements, buttons, object elements, marquees, table cells, and table
|
||||
// captions, and are used to prevent formatting from 'leaking'".
|
||||
var scopeMarker = Node{Type: scopeMarkerNode}
|
||||
|
||||
// A Node consists of a NodeType and some Data (tag name for element nodes,
|
||||
// content for text) and are part of a tree of Nodes. Element nodes may also
|
||||
// have a Namespace and contain a slice of Attributes. Data is unescaped, so
|
||||
// that it looks like "a<b" rather than "a<b". For element nodes, DataAtom
|
||||
// is the atom for Data, or zero if Data is not a known tag name.
|
||||
//
|
||||
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
|
||||
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
|
||||
// "svg" is short for "http://www.w3.org/2000/svg".
|
||||
type Node struct {
|
||||
Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node
|
||||
|
||||
Type NodeType
|
||||
DataAtom atom.Atom
|
||||
Data string
|
||||
Namespace string
|
||||
Attr []Attribute
|
||||
}
|
||||
|
||||
// InsertBefore inserts newChild as a child of n, immediately before oldChild
|
||||
// in the sequence of n's children. oldChild may be nil, in which case newChild
|
||||
// is appended to the end of n's children.
|
||||
//
|
||||
// It will panic if newChild already has a parent or siblings.
|
||||
func (n *Node) InsertBefore(newChild, oldChild *Node) {
|
||||
if newChild.Parent != nil || newChild.PrevSibling != nil || newChild.NextSibling != nil {
|
||||
panic("html: InsertBefore called for an attached child Node")
|
||||
}
|
||||
var prev, next *Node
|
||||
if oldChild != nil {
|
||||
prev, next = oldChild.PrevSibling, oldChild
|
||||
} else {
|
||||
prev = n.LastChild
|
||||
}
|
||||
if prev != nil {
|
||||
prev.NextSibling = newChild
|
||||
} else {
|
||||
n.FirstChild = newChild
|
||||
}
|
||||
if next != nil {
|
||||
next.PrevSibling = newChild
|
||||
} else {
|
||||
n.LastChild = newChild
|
||||
}
|
||||
newChild.Parent = n
|
||||
newChild.PrevSibling = prev
|
||||
newChild.NextSibling = next
|
||||
}
|
||||
|
||||
// AppendChild adds a node c as a child of n.
|
||||
//
|
||||
// It will panic if c already has a parent or siblings.
|
||||
func (n *Node) AppendChild(c *Node) {
|
||||
if c.Parent != nil || c.PrevSibling != nil || c.NextSibling != nil {
|
||||
panic("html: AppendChild called for an attached child Node")
|
||||
}
|
||||
last := n.LastChild
|
||||
if last != nil {
|
||||
last.NextSibling = c
|
||||
} else {
|
||||
n.FirstChild = c
|
||||
}
|
||||
n.LastChild = c
|
||||
c.Parent = n
|
||||
c.PrevSibling = last
|
||||
}
|
||||
|
||||
// RemoveChild removes a node c that is a child of n. Afterwards, c will have
|
||||
// no parent and no siblings.
|
||||
//
|
||||
// It will panic if c's parent is not n.
|
||||
func (n *Node) RemoveChild(c *Node) {
|
||||
if c.Parent != n {
|
||||
panic("html: RemoveChild called for a non-child Node")
|
||||
}
|
||||
if n.FirstChild == c {
|
||||
n.FirstChild = c.NextSibling
|
||||
}
|
||||
if c.NextSibling != nil {
|
||||
c.NextSibling.PrevSibling = c.PrevSibling
|
||||
}
|
||||
if n.LastChild == c {
|
||||
n.LastChild = c.PrevSibling
|
||||
}
|
||||
if c.PrevSibling != nil {
|
||||
c.PrevSibling.NextSibling = c.NextSibling
|
||||
}
|
||||
c.Parent = nil
|
||||
c.PrevSibling = nil
|
||||
c.NextSibling = nil
|
||||
}
|
||||
|
||||
// reparentChildren reparents all of src's child nodes to dst.
|
||||
func reparentChildren(dst, src *Node) {
|
||||
for {
|
||||
child := src.FirstChild
|
||||
if child == nil {
|
||||
break
|
||||
}
|
||||
src.RemoveChild(child)
|
||||
dst.AppendChild(child)
|
||||
}
|
||||
}
|
||||
|
||||
// clone returns a new node with the same type, data and attributes.
|
||||
// The clone has no parent, no siblings and no children.
|
||||
func (n *Node) clone() *Node {
|
||||
m := &Node{
|
||||
Type: n.Type,
|
||||
DataAtom: n.DataAtom,
|
||||
Data: n.Data,
|
||||
Attr: make([]Attribute, len(n.Attr)),
|
||||
}
|
||||
copy(m.Attr, n.Attr)
|
||||
return m
|
||||
}
|
||||
|
||||
// nodeStack is a stack of nodes.
|
||||
type nodeStack []*Node
|
||||
|
||||
// pop pops the stack. It will panic if s is empty.
|
||||
func (s *nodeStack) pop() *Node {
|
||||
i := len(*s)
|
||||
n := (*s)[i-1]
|
||||
*s = (*s)[:i-1]
|
||||
return n
|
||||
}
|
||||
|
||||
// top returns the most recently pushed node, or nil if s is empty.
|
||||
func (s *nodeStack) top() *Node {
|
||||
if i := len(*s); i > 0 {
|
||||
return (*s)[i-1]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// index returns the index of the top-most occurrence of n in the stack, or -1
|
||||
// if n is not present.
|
||||
func (s *nodeStack) index(n *Node) int {
|
||||
for i := len(*s) - 1; i >= 0; i-- {
|
||||
if (*s)[i] == n {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// insert inserts a node at the given index.
|
||||
func (s *nodeStack) insert(i int, n *Node) {
|
||||
(*s) = append(*s, nil)
|
||||
copy((*s)[i+1:], (*s)[i:])
|
||||
(*s)[i] = n
|
||||
}
|
||||
|
||||
// remove removes a node from the stack. It is a no-op if n is not present.
|
||||
func (s *nodeStack) remove(n *Node) {
|
||||
i := s.index(n)
|
||||
if i == -1 {
|
||||
return
|
||||
}
|
||||
copy((*s)[i:], (*s)[i+1:])
|
||||
j := len(*s) - 1
|
||||
(*s)[j] = nil
|
||||
*s = (*s)[:j]
|
||||
}
|
||||
+2094
File diff suppressed because it is too large
Load Diff
+271
@@ -0,0 +1,271 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package html
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type writer interface {
|
||||
io.Writer
|
||||
io.ByteWriter
|
||||
WriteString(string) (int, error)
|
||||
}
|
||||
|
||||
// Render renders the parse tree n to the given writer.
|
||||
//
|
||||
// Rendering is done on a 'best effort' basis: calling Parse on the output of
|
||||
// Render will always result in something similar to the original tree, but it
|
||||
// is not necessarily an exact clone unless the original tree was 'well-formed'.
|
||||
// 'Well-formed' is not easily specified; the HTML5 specification is
|
||||
// complicated.
|
||||
//
|
||||
// Calling Parse on arbitrary input typically results in a 'well-formed' parse
|
||||
// tree. However, it is possible for Parse to yield a 'badly-formed' parse tree.
|
||||
// For example, in a 'well-formed' parse tree, no <a> element is a child of
|
||||
// another <a> element: parsing "<a><a>" results in two sibling elements.
|
||||
// Similarly, in a 'well-formed' parse tree, no <a> element is a child of a
|
||||
// <table> element: parsing "<p><table><a>" results in a <p> with two sibling
|
||||
// children; the <a> is reparented to the <table>'s parent. However, calling
|
||||
// Parse on "<a><table><a>" does not return an error, but the result has an <a>
|
||||
// element with an <a> child, and is therefore not 'well-formed'.
|
||||
//
|
||||
// Programmatically constructed trees are typically also 'well-formed', but it
|
||||
// is possible to construct a tree that looks innocuous but, when rendered and
|
||||
// re-parsed, results in a different tree. A simple example is that a solitary
|
||||
// text node would become a tree containing <html>, <head> and <body> elements.
|
||||
// Another example is that the programmatic equivalent of "a<head>b</head>c"
|
||||
// becomes "<html><head><head/><body>abc</body></html>".
|
||||
func Render(w io.Writer, n *Node) error {
|
||||
if x, ok := w.(writer); ok {
|
||||
return render(x, n)
|
||||
}
|
||||
buf := bufio.NewWriter(w)
|
||||
if err := render(buf, n); err != nil {
|
||||
return err
|
||||
}
|
||||
return buf.Flush()
|
||||
}
|
||||
|
||||
// plaintextAbort is returned from render1 when a <plaintext> element
|
||||
// has been rendered. No more end tags should be rendered after that.
|
||||
var plaintextAbort = errors.New("html: internal error (plaintext abort)")
|
||||
|
||||
func render(w writer, n *Node) error {
|
||||
err := render1(w, n)
|
||||
if err == plaintextAbort {
|
||||
err = nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func render1(w writer, n *Node) error {
|
||||
// Render non-element nodes; these are the easy cases.
|
||||
switch n.Type {
|
||||
case ErrorNode:
|
||||
return errors.New("html: cannot render an ErrorNode node")
|
||||
case TextNode:
|
||||
return escape(w, n.Data)
|
||||
case DocumentNode:
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if err := render1(w, c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
case ElementNode:
|
||||
// No-op.
|
||||
case CommentNode:
|
||||
if _, err := w.WriteString("<!--"); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := w.WriteString(n.Data); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := w.WriteString("-->"); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
case DoctypeNode:
|
||||
if _, err := w.WriteString("<!DOCTYPE "); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := w.WriteString(n.Data); err != nil {
|
||||
return err
|
||||
}
|
||||
if n.Attr != nil {
|
||||
var p, s string
|
||||
for _, a := range n.Attr {
|
||||
switch a.Key {
|
||||
case "public":
|
||||
p = a.Val
|
||||
case "system":
|
||||
s = a.Val
|
||||
}
|
||||
}
|
||||
if p != "" {
|
||||
if _, err := w.WriteString(" PUBLIC "); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeQuoted(w, p); err != nil {
|
||||
return err
|
||||
}
|
||||
if s != "" {
|
||||
if err := w.WriteByte(' '); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeQuoted(w, s); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else if s != "" {
|
||||
if _, err := w.WriteString(" SYSTEM "); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeQuoted(w, s); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return w.WriteByte('>')
|
||||
default:
|
||||
return errors.New("html: unknown node type")
|
||||
}
|
||||
|
||||
// Render the <xxx> opening tag.
|
||||
if err := w.WriteByte('<'); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := w.WriteString(n.Data); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, a := range n.Attr {
|
||||
if err := w.WriteByte(' '); err != nil {
|
||||
return err
|
||||
}
|
||||
if a.Namespace != "" {
|
||||
if _, err := w.WriteString(a.Namespace); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := w.WriteByte(':'); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if _, err := w.WriteString(a.Key); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := w.WriteString(`="`); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := escape(w, a.Val); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := w.WriteByte('"'); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if voidElements[n.Data] {
|
||||
if n.FirstChild != nil {
|
||||
return fmt.Errorf("html: void element <%s> has child nodes", n.Data)
|
||||
}
|
||||
_, err := w.WriteString("/>")
|
||||
return err
|
||||
}
|
||||
if err := w.WriteByte('>'); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Add initial newline where there is danger of a newline beging ignored.
|
||||
if c := n.FirstChild; c != nil && c.Type == TextNode && strings.HasPrefix(c.Data, "\n") {
|
||||
switch n.Data {
|
||||
case "pre", "listing", "textarea":
|
||||
if err := w.WriteByte('\n'); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Render any child nodes.
|
||||
switch n.Data {
|
||||
case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp":
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if c.Type == TextNode {
|
||||
if _, err := w.WriteString(c.Data); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
if err := render1(w, c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
if n.Data == "plaintext" {
|
||||
// Don't render anything else. <plaintext> must be the
|
||||
// last element in the file, with no closing tag.
|
||||
return plaintextAbort
|
||||
}
|
||||
default:
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if err := render1(w, c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Render the </xxx> closing tag.
|
||||
if _, err := w.WriteString("</"); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := w.WriteString(n.Data); err != nil {
|
||||
return err
|
||||
}
|
||||
return w.WriteByte('>')
|
||||
}
|
||||
|
||||
// writeQuoted writes s to w surrounded by quotes. Normally it will use double
|
||||
// quotes, but if s contains a double quote, it will use single quotes.
|
||||
// It is used for writing the identifiers in a doctype declaration.
|
||||
// In valid HTML, they can't contain both types of quotes.
|
||||
func writeQuoted(w writer, s string) error {
|
||||
var q byte = '"'
|
||||
if strings.Contains(s, `"`) {
|
||||
q = '\''
|
||||
}
|
||||
if err := w.WriteByte(q); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := w.WriteString(s); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := w.WriteByte(q); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Section 12.1.2, "Elements", gives this list of void elements. Void elements
|
||||
// are those that can't have any contents.
|
||||
var voidElements = map[string]bool{
|
||||
"area": true,
|
||||
"base": true,
|
||||
"br": true,
|
||||
"col": true,
|
||||
"command": true,
|
||||
"embed": true,
|
||||
"hr": true,
|
||||
"img": true,
|
||||
"input": true,
|
||||
"keygen": true,
|
||||
"link": true,
|
||||
"meta": true,
|
||||
"param": true,
|
||||
"source": true,
|
||||
"track": true,
|
||||
"wbr": true,
|
||||
}
|
||||
+1219
File diff suppressed because it is too large
Load Diff
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+209
@@ -0,0 +1,209 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run maketables.go
|
||||
|
||||
// Package charmap provides simple character encodings such as IBM Code Page 437
|
||||
// and Windows 1252.
|
||||
package charmap // import "golang.org/x/text/encoding/charmap"
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// These encodings vary only in the way clients should interpret them. Their
|
||||
// coded character set is identical and a single implementation can be shared.
|
||||
var (
|
||||
// ISO8859_6E is the ISO 8859-6E encoding.
|
||||
ISO8859_6E encoding.Encoding = &iso8859_6E
|
||||
|
||||
// ISO8859_6I is the ISO 8859-6I encoding.
|
||||
ISO8859_6I encoding.Encoding = &iso8859_6I
|
||||
|
||||
// ISO8859_8E is the ISO 8859-8E encoding.
|
||||
ISO8859_8E encoding.Encoding = &iso8859_8E
|
||||
|
||||
// ISO8859_8I is the ISO 8859-8I encoding.
|
||||
ISO8859_8I encoding.Encoding = &iso8859_8I
|
||||
|
||||
iso8859_6E = internal.Encoding{
|
||||
ISO8859_6,
|
||||
"ISO-8859-6E",
|
||||
identifier.ISO88596E,
|
||||
}
|
||||
|
||||
iso8859_6I = internal.Encoding{
|
||||
ISO8859_6,
|
||||
"ISO-8859-6I",
|
||||
identifier.ISO88596I,
|
||||
}
|
||||
|
||||
iso8859_8E = internal.Encoding{
|
||||
ISO8859_8,
|
||||
"ISO-8859-8E",
|
||||
identifier.ISO88598E,
|
||||
}
|
||||
|
||||
iso8859_8I = internal.Encoding{
|
||||
ISO8859_8,
|
||||
"ISO-8859-8I",
|
||||
identifier.ISO88598I,
|
||||
}
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = listAll
|
||||
|
||||
// TODO: implement these encodings, in order of importance.
|
||||
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
|
||||
// ISO8859_9: Close to Windows 1254.
|
||||
|
||||
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
|
||||
type utf8Enc struct {
|
||||
len uint8
|
||||
data [3]byte
|
||||
}
|
||||
|
||||
// charmap describes an 8-bit character set encoding.
|
||||
type charmap struct {
|
||||
// name is the encoding's name.
|
||||
name string
|
||||
// mib is the encoding type of this encoder.
|
||||
mib identifier.MIB
|
||||
// asciiSuperset states whether the encoding is a superset of ASCII.
|
||||
asciiSuperset bool
|
||||
// low is the lower bound of the encoded byte for a non-ASCII rune. If
|
||||
// charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
|
||||
low uint8
|
||||
// replacement is the encoded replacement character.
|
||||
replacement byte
|
||||
// decode is the map from encoded byte to UTF-8.
|
||||
decode [256]utf8Enc
|
||||
// encoding is the map from runes to encoded bytes. Each entry is a
|
||||
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
|
||||
// the rune. The table entries are sorted by ascending rune.
|
||||
encode [256]uint32
|
||||
}
|
||||
|
||||
func (m *charmap) NewDecoder() *encoding.Decoder {
|
||||
return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
|
||||
}
|
||||
|
||||
func (m *charmap) NewEncoder() *encoding.Encoder {
|
||||
return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
|
||||
}
|
||||
|
||||
func (m *charmap) String() string {
|
||||
return m.name
|
||||
}
|
||||
|
||||
func (m *charmap) ID() (mib identifier.MIB, other string) {
|
||||
return m.mib, ""
|
||||
}
|
||||
|
||||
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
|
||||
type charmapDecoder struct {
|
||||
transform.NopResetter
|
||||
charmap *charmap
|
||||
}
|
||||
|
||||
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for i, c := range src {
|
||||
if m.charmap.asciiSuperset && c < utf8.RuneSelf {
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = c
|
||||
nDst++
|
||||
nSrc = i + 1
|
||||
continue
|
||||
}
|
||||
|
||||
decode := &m.charmap.decode[c]
|
||||
n := int(decode.len)
|
||||
if nDst+n > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
// It's 15% faster to avoid calling copy for these tiny slices.
|
||||
for j := 0; j < n; j++ {
|
||||
dst[nDst] = decode.data[j]
|
||||
nDst++
|
||||
}
|
||||
nSrc = i + 1
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
|
||||
type charmapEncoder struct {
|
||||
transform.NopResetter
|
||||
charmap *charmap
|
||||
}
|
||||
|
||||
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for nSrc < len(src) {
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
if m.charmap.asciiSuperset {
|
||||
nSrc++
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
}
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
} else {
|
||||
err = internal.RepertoireError(m.charmap.replacement)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Binary search in [low, high) for that rune in the m.charmap.encode table.
|
||||
for low, high := int(m.charmap.low), 0x100; ; {
|
||||
if low >= high {
|
||||
err = internal.RepertoireError(m.charmap.replacement)
|
||||
break loop
|
||||
}
|
||||
mid := (low + high) / 2
|
||||
got := m.charmap.encode[mid]
|
||||
gotRune := rune(got & (1<<24 - 1))
|
||||
if gotRune < r {
|
||||
low = mid + 1
|
||||
} else if gotRune > r {
|
||||
high = mid
|
||||
} else {
|
||||
dst[nDst] = byte(got >> 24)
|
||||
nDst++
|
||||
break
|
||||
}
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+556
@@ -0,0 +1,556 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/internal/gen"
|
||||
)
|
||||
|
||||
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
|
||||
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
|
||||
` !"#$%&'()*+,-./0123456789:;<=>?` +
|
||||
`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
|
||||
"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
|
||||
|
||||
var encodings = []struct {
|
||||
name string
|
||||
mib string
|
||||
comment string
|
||||
varName string
|
||||
replacement byte
|
||||
mapping string
|
||||
}{
|
||||
{
|
||||
"IBM Code Page 037",
|
||||
"IBM037",
|
||||
"",
|
||||
"CodePage037",
|
||||
0x3f,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 437",
|
||||
"PC8CodePage437",
|
||||
"",
|
||||
"CodePage437",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 850",
|
||||
"PC850Multilingual",
|
||||
"",
|
||||
"CodePage850",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 852",
|
||||
"PCp852",
|
||||
"",
|
||||
"CodePage852",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 855",
|
||||
"IBM855",
|
||||
"",
|
||||
"CodePage855",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"Windows Code Page 858", // PC latin1 with Euro
|
||||
"IBM00858",
|
||||
"",
|
||||
"CodePage858",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 860",
|
||||
"IBM860",
|
||||
"",
|
||||
"CodePage860",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 862",
|
||||
"PC862LatinHebrew",
|
||||
"",
|
||||
"CodePage862",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 863",
|
||||
"IBM863",
|
||||
"",
|
||||
"CodePage863",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 865",
|
||||
"IBM865",
|
||||
"",
|
||||
"CodePage865",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 866",
|
||||
"IBM866",
|
||||
"",
|
||||
"CodePage866",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-ibm866.txt",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 1047",
|
||||
"IBM1047",
|
||||
"",
|
||||
"CodePage1047",
|
||||
0x3f,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
|
||||
},
|
||||
{
|
||||
"IBM Code Page 1140",
|
||||
"IBM01140",
|
||||
"",
|
||||
"CodePage1140",
|
||||
0x3f,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
|
||||
},
|
||||
{
|
||||
"ISO 8859-1",
|
||||
"ISOLatin1",
|
||||
"",
|
||||
"ISO8859_1",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
|
||||
},
|
||||
{
|
||||
"ISO 8859-2",
|
||||
"ISOLatin2",
|
||||
"",
|
||||
"ISO8859_2",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-3",
|
||||
"ISOLatin3",
|
||||
"",
|
||||
"ISO8859_3",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-4",
|
||||
"ISOLatin4",
|
||||
"",
|
||||
"ISO8859_4",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-5",
|
||||
"ISOLatinCyrillic",
|
||||
"",
|
||||
"ISO8859_5",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-6",
|
||||
"ISOLatinArabic",
|
||||
"",
|
||||
"ISO8859_6,ISO8859_6E,ISO8859_6I",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-7",
|
||||
"ISOLatinGreek",
|
||||
"",
|
||||
"ISO8859_7",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-8",
|
||||
"ISOLatinHebrew",
|
||||
"",
|
||||
"ISO8859_8,ISO8859_8E,ISO8859_8I",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-9",
|
||||
"ISOLatin5",
|
||||
"",
|
||||
"ISO8859_9",
|
||||
encoding.ASCIISub,
|
||||
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
|
||||
},
|
||||
{
|
||||
"ISO 8859-10",
|
||||
"ISOLatin6",
|
||||
"",
|
||||
"ISO8859_10",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-13",
|
||||
"ISO885913",
|
||||
"",
|
||||
"ISO8859_13",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-14",
|
||||
"ISO885914",
|
||||
"",
|
||||
"ISO8859_14",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-15",
|
||||
"ISO885915",
|
||||
"",
|
||||
"ISO8859_15",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
|
||||
},
|
||||
{
|
||||
"ISO 8859-16",
|
||||
"ISO885916",
|
||||
"",
|
||||
"ISO8859_16",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
|
||||
},
|
||||
{
|
||||
"KOI8-R",
|
||||
"KOI8R",
|
||||
"",
|
||||
"KOI8R",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-koi8-r.txt",
|
||||
},
|
||||
{
|
||||
"KOI8-U",
|
||||
"KOI8U",
|
||||
"",
|
||||
"KOI8U",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-koi8-u.txt",
|
||||
},
|
||||
{
|
||||
"Macintosh",
|
||||
"Macintosh",
|
||||
"",
|
||||
"Macintosh",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-macintosh.txt",
|
||||
},
|
||||
{
|
||||
"Macintosh Cyrillic",
|
||||
"MacintoshCyrillic",
|
||||
"",
|
||||
"MacintoshCyrillic",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
|
||||
},
|
||||
{
|
||||
"Windows 874",
|
||||
"Windows874",
|
||||
"",
|
||||
"Windows874",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-874.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1250",
|
||||
"Windows1250",
|
||||
"",
|
||||
"Windows1250",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1250.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1251",
|
||||
"Windows1251",
|
||||
"",
|
||||
"Windows1251",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1251.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1252",
|
||||
"Windows1252",
|
||||
"",
|
||||
"Windows1252",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1252.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1253",
|
||||
"Windows1253",
|
||||
"",
|
||||
"Windows1253",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1253.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1254",
|
||||
"Windows1254",
|
||||
"",
|
||||
"Windows1254",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1254.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1255",
|
||||
"Windows1255",
|
||||
"",
|
||||
"Windows1255",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1255.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1256",
|
||||
"Windows1256",
|
||||
"",
|
||||
"Windows1256",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1256.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1257",
|
||||
"Windows1257",
|
||||
"",
|
||||
"Windows1257",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1257.txt",
|
||||
},
|
||||
{
|
||||
"Windows 1258",
|
||||
"Windows1258",
|
||||
"",
|
||||
"Windows1258",
|
||||
encoding.ASCIISub,
|
||||
"http://encoding.spec.whatwg.org/index-windows-1258.txt",
|
||||
},
|
||||
{
|
||||
"X-User-Defined",
|
||||
"XUserDefined",
|
||||
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
|
||||
"XUserDefined",
|
||||
encoding.ASCIISub,
|
||||
ascii +
|
||||
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
|
||||
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
|
||||
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
|
||||
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
|
||||
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
|
||||
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
|
||||
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
|
||||
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
|
||||
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
|
||||
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
|
||||
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
|
||||
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
|
||||
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
|
||||
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
|
||||
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
|
||||
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
|
||||
},
|
||||
}
|
||||
|
||||
func getWHATWG(url string) string {
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
log.Fatalf("%q: Get: %v", url, err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := make([]rune, 128)
|
||||
for i := range mapping {
|
||||
mapping[i] = '\ufffd'
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := 0, 0
|
||||
if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0 || 128 <= x {
|
||||
log.Fatalf("code %d is out of range", x)
|
||||
}
|
||||
if 0x80 <= y && y < 0xa0 {
|
||||
// We diverge from the WHATWG spec by mapping control characters
|
||||
// in the range [0x80, 0xa0) to U+FFFD.
|
||||
continue
|
||||
}
|
||||
mapping[x] = rune(y)
|
||||
}
|
||||
return ascii + string(mapping)
|
||||
}
|
||||
|
||||
func getUCM(url string) string {
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
log.Fatalf("%q: Get: %v", url, err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := make([]rune, 256)
|
||||
for i := range mapping {
|
||||
mapping[i] = '\ufffd'
|
||||
}
|
||||
|
||||
charsFound := 0
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
var c byte
|
||||
var r rune
|
||||
if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
|
||||
continue
|
||||
}
|
||||
mapping[c] = r
|
||||
charsFound++
|
||||
}
|
||||
|
||||
if charsFound < 200 {
|
||||
log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
|
||||
}
|
||||
|
||||
return string(mapping)
|
||||
}
|
||||
|
||||
func main() {
|
||||
mibs := map[string]bool{}
|
||||
all := []string{}
|
||||
|
||||
w := gen.NewCodeWriter()
|
||||
defer w.WriteGoFile("tables.go", "charmap")
|
||||
|
||||
printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
|
||||
|
||||
printf("import (\n")
|
||||
printf("\t\"golang.org/x/text/encoding\"\n")
|
||||
printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
|
||||
printf(")\n\n")
|
||||
for _, e := range encodings {
|
||||
varNames := strings.Split(e.varName, ",")
|
||||
all = append(all, varNames...)
|
||||
varName := varNames[0]
|
||||
switch {
|
||||
case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
|
||||
e.mapping = getWHATWG(e.mapping)
|
||||
case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
|
||||
e.mapping = getUCM(e.mapping)
|
||||
}
|
||||
|
||||
asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
|
||||
if asciiSuperset {
|
||||
low = 0x80
|
||||
}
|
||||
lvn := 1
|
||||
if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
|
||||
lvn = 3
|
||||
}
|
||||
lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
|
||||
printf("// %s is the %s encoding.\n", varName, e.name)
|
||||
if e.comment != "" {
|
||||
printf("//\n// %s\n", e.comment)
|
||||
}
|
||||
printf("var %s encoding.Encoding = &%s\n\nvar %s = charmap{\nname: %q,\n",
|
||||
varName, lowerVarName, lowerVarName, e.name)
|
||||
if mibs[e.mib] {
|
||||
log.Fatalf("MIB type %q declared multiple times.", e.mib)
|
||||
}
|
||||
printf("mib: identifier.%s,\n", e.mib)
|
||||
printf("asciiSuperset: %t,\n", asciiSuperset)
|
||||
printf("low: 0x%02x,\n", low)
|
||||
printf("replacement: 0x%02x,\n", e.replacement)
|
||||
|
||||
printf("decode: [256]utf8Enc{\n")
|
||||
i, backMapping := 0, map[rune]byte{}
|
||||
for _, c := range e.mapping {
|
||||
if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
|
||||
backMapping[c] = byte(i)
|
||||
}
|
||||
var buf [8]byte
|
||||
n := utf8.EncodeRune(buf[:], c)
|
||||
if n > 3 {
|
||||
panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
|
||||
}
|
||||
printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
|
||||
if i%2 == 1 {
|
||||
printf("\n")
|
||||
}
|
||||
i++
|
||||
}
|
||||
printf("},\n")
|
||||
|
||||
printf("encode: [256]uint32{\n")
|
||||
encode := make([]uint32, 0, 256)
|
||||
for c, i := range backMapping {
|
||||
encode = append(encode, uint32(i)<<24|uint32(c))
|
||||
}
|
||||
sort.Sort(byRune(encode))
|
||||
for len(encode) < cap(encode) {
|
||||
encode = append(encode, encode[len(encode)-1])
|
||||
}
|
||||
for i, enc := range encode {
|
||||
printf("0x%08x,", enc)
|
||||
if i%8 == 7 {
|
||||
printf("\n")
|
||||
}
|
||||
}
|
||||
printf("},\n}\n")
|
||||
|
||||
// Add an estimate of the size of a single charmap{} struct value, which
|
||||
// includes two 256 elem arrays of 4 bytes and some extra fields, which
|
||||
// align to 3 uint64s on 64-bit architectures.
|
||||
w.Size += 2*4*256 + 3*8
|
||||
}
|
||||
// TODO: add proper line breaking.
|
||||
printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
|
||||
}
|
||||
|
||||
type byRune []uint32
|
||||
|
||||
func (b byRune) Len() int { return len(b) }
|
||||
func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
|
||||
func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
+7410
File diff suppressed because it is too large
Load Diff
+335
@@ -0,0 +1,335 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package encoding defines an interface for character encodings, such as Shift
|
||||
// JIS and Windows 1252, that can convert to and from UTF-8.
|
||||
//
|
||||
// Encoding implementations are provided in other packages, such as
|
||||
// golang.org/x/text/encoding/charmap and
|
||||
// golang.org/x/text/encoding/japanese.
|
||||
package encoding // import "golang.org/x/text/encoding"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"strconv"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// TODO:
|
||||
// - There seems to be some inconsistency in when decoders return errors
|
||||
// and when not. Also documentation seems to suggest they shouldn't return
|
||||
// errors at all (except for UTF-16).
|
||||
// - Encoders seem to rely on or at least benefit from the input being in NFC
|
||||
// normal form. Perhaps add an example how users could prepare their output.
|
||||
|
||||
// Encoding is a character set encoding that can be transformed to and from
|
||||
// UTF-8.
|
||||
type Encoding interface {
|
||||
// NewDecoder returns a Decoder.
|
||||
NewDecoder() *Decoder
|
||||
|
||||
// NewEncoder returns an Encoder.
|
||||
NewEncoder() *Encoder
|
||||
}
|
||||
|
||||
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
|
||||
//
|
||||
// Transforming source bytes that are not of that encoding will not result in an
|
||||
// error per se. Each byte that cannot be transcoded will be represented in the
|
||||
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
|
||||
type Decoder struct {
|
||||
transform.Transformer
|
||||
|
||||
// This forces external creators of Decoders to use names in struct
|
||||
// initializers, allowing for future extendibility without having to break
|
||||
// code.
|
||||
_ struct{}
|
||||
}
|
||||
|
||||
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
|
||||
// bytes or nil, err if any error occurred.
|
||||
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
|
||||
b, _, err := transform.Bytes(d, b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// String converts the given encoded string to UTF-8. It returns the converted
|
||||
// string or "", err if any error occurred.
|
||||
func (d *Decoder) String(s string) (string, error) {
|
||||
s, _, err := transform.String(d, s)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Reader wraps another Reader to decode its bytes.
|
||||
//
|
||||
// The Decoder may not be used for any other operation as long as the returned
|
||||
// Reader is in use.
|
||||
func (d *Decoder) Reader(r io.Reader) io.Reader {
|
||||
return transform.NewReader(r, d)
|
||||
}
|
||||
|
||||
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
|
||||
//
|
||||
// Each rune that cannot be transcoded will result in an error. In this case,
|
||||
// the transform will consume all source byte up to, not including the offending
|
||||
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
|
||||
// `\uFFFD`. To return early with an error instead, use transform.Chain to
|
||||
// preprocess the data with a UTF8Validator.
|
||||
type Encoder struct {
|
||||
transform.Transformer
|
||||
|
||||
// This forces external creators of Encoders to use names in struct
|
||||
// initializers, allowing for future extendibility without having to break
|
||||
// code.
|
||||
_ struct{}
|
||||
}
|
||||
|
||||
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
|
||||
// any error occurred.
|
||||
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
|
||||
b, _, err := transform.Bytes(e, b)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
// String converts a string from UTF-8. It returns the converted string or
|
||||
// "", err if any error occurred.
|
||||
func (e *Encoder) String(s string) (string, error) {
|
||||
s, _, err := transform.String(e, s)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Writer wraps another Writer to encode its UTF-8 output.
|
||||
//
|
||||
// The Encoder may not be used for any other operation as long as the returned
|
||||
// Writer is in use.
|
||||
func (e *Encoder) Writer(w io.Writer) io.Writer {
|
||||
return transform.NewWriter(w, e)
|
||||
}
|
||||
|
||||
// ASCIISub is the ASCII substitute character, as recommended by
|
||||
// http://unicode.org/reports/tr36/#Text_Comparison
|
||||
const ASCIISub = '\x1a'
|
||||
|
||||
// Nop is the nop encoding. Its transformed bytes are the same as the source
|
||||
// bytes; it does not replace invalid UTF-8 sequences.
|
||||
var Nop Encoding = nop{}
|
||||
|
||||
type nop struct{}
|
||||
|
||||
func (nop) NewDecoder() *Decoder {
|
||||
return &Decoder{Transformer: transform.Nop}
|
||||
}
|
||||
func (nop) NewEncoder() *Encoder {
|
||||
return &Encoder{Transformer: transform.Nop}
|
||||
}
|
||||
|
||||
// Replacement is the replacement encoding. Decoding from the replacement
|
||||
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
|
||||
// the replacement encoding yields the same as the source bytes except that
|
||||
// invalid UTF-8 is converted to '\uFFFD'.
|
||||
//
|
||||
// It is defined at http://encoding.spec.whatwg.org/#replacement
|
||||
var Replacement Encoding = replacement{}
|
||||
|
||||
type replacement struct{}
|
||||
|
||||
func (replacement) NewDecoder() *Decoder {
|
||||
return &Decoder{Transformer: replacementDecoder{}}
|
||||
}
|
||||
|
||||
func (replacement) NewEncoder() *Encoder {
|
||||
return &Encoder{Transformer: replacementEncoder{}}
|
||||
}
|
||||
|
||||
func (replacement) ID() (mib identifier.MIB, other string) {
|
||||
return identifier.Replacement, ""
|
||||
}
|
||||
|
||||
type replacementDecoder struct{ transform.NopResetter }
|
||||
|
||||
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if len(dst) < 3 {
|
||||
return 0, 0, transform.ErrShortDst
|
||||
}
|
||||
if atEOF {
|
||||
const fffd = "\ufffd"
|
||||
dst[0] = fffd[0]
|
||||
dst[1] = fffd[1]
|
||||
dst[2] = fffd[2]
|
||||
nDst = 3
|
||||
}
|
||||
return nDst, len(src), nil
|
||||
}
|
||||
|
||||
type replacementEncoder struct{ transform.NopResetter }
|
||||
|
||||
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
|
||||
// repertoire of the destination encoding with HTML escape sequences.
|
||||
//
|
||||
// This wrapper exists to comply to URL and HTML forms requiring a
|
||||
// non-terminating legacy encoder. The produced sequences may lead to data
|
||||
// loss as they are indistinguishable from legitimate input. To avoid this
|
||||
// issue, use UTF-8 encodings whenever possible.
|
||||
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
|
||||
return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
|
||||
}
|
||||
|
||||
// ReplaceUnsupported wraps encoders to replace source runes outside the
|
||||
// repertoire of the destination encoding with an encoding-specific
|
||||
// replacement.
|
||||
//
|
||||
// This wrapper is only provided for backwards compatibility and legacy
|
||||
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
|
||||
func ReplaceUnsupported(e *Encoder) *Encoder {
|
||||
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
|
||||
}
|
||||
|
||||
type errorHandler struct {
|
||||
*Encoder
|
||||
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
|
||||
}
|
||||
|
||||
// TODO: consider making this error public in some form.
|
||||
type repertoireError interface {
|
||||
Replacement() byte
|
||||
}
|
||||
|
||||
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
|
||||
for err != nil {
|
||||
rerr, ok := err.(repertoireError)
|
||||
if !ok {
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
r, sz := utf8.DecodeRune(src[nSrc:])
|
||||
n, ok := h.handler(dst[nDst:], r, rerr)
|
||||
if !ok {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
err = nil
|
||||
nDst += n
|
||||
if nSrc += sz; nSrc < len(src) {
|
||||
var dn, sn int
|
||||
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
|
||||
nDst += dn
|
||||
nSrc += sn
|
||||
}
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
||||
buf := [8]byte{}
|
||||
b := strconv.AppendUint(buf[:0], uint64(r), 10)
|
||||
if n = len(b) + len("&#;"); n >= len(dst) {
|
||||
return 0, false
|
||||
}
|
||||
dst[0] = '&'
|
||||
dst[1] = '#'
|
||||
dst[copy(dst[2:], b)+2] = ';'
|
||||
return n, true
|
||||
}
|
||||
|
||||
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
|
||||
if len(dst) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
dst[0] = err.Replacement()
|
||||
return 1, true
|
||||
}
|
||||
|
||||
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
|
||||
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
|
||||
|
||||
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
|
||||
// input byte that is not valid UTF-8.
|
||||
var UTF8Validator transform.Transformer = utf8Validator{}
|
||||
|
||||
type utf8Validator struct{ transform.NopResetter }
|
||||
|
||||
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
n := len(src)
|
||||
if n > len(dst) {
|
||||
n = len(dst)
|
||||
}
|
||||
for i := 0; i < n; {
|
||||
if c := src[i]; c < utf8.RuneSelf {
|
||||
dst[i] = c
|
||||
i++
|
||||
continue
|
||||
}
|
||||
_, size := utf8.DecodeRune(src[i:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
err = ErrInvalidUTF8
|
||||
if !atEOF && !utf8.FullRune(src[i:]) {
|
||||
err = transform.ErrShortSrc
|
||||
}
|
||||
return i, i, err
|
||||
}
|
||||
if i+size > len(dst) {
|
||||
return i, i, transform.ErrShortDst
|
||||
}
|
||||
for ; size > 0; size-- {
|
||||
dst[i] = src[i]
|
||||
i++
|
||||
}
|
||||
}
|
||||
if len(src) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
}
|
||||
return n, n, err
|
||||
}
|
||||
+170
@@ -0,0 +1,170 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
)
|
||||
|
||||
type group struct {
|
||||
Encodings []struct {
|
||||
Labels []string
|
||||
Name string
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
|
||||
r := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json")
|
||||
var groups []group
|
||||
if err := json.NewDecoder(r).Decode(&groups); err != nil {
|
||||
log.Fatalf("Error reading encodings.json: %v", err)
|
||||
}
|
||||
|
||||
w := &bytes.Buffer{}
|
||||
fmt.Fprintln(w, "type htmlEncoding byte")
|
||||
fmt.Fprintln(w, "const (")
|
||||
for i, g := range groups {
|
||||
for _, e := range g.Encodings {
|
||||
key := strings.ToLower(e.Name)
|
||||
name := consts[key]
|
||||
if name == "" {
|
||||
log.Fatalf("No const defined for %s.", key)
|
||||
}
|
||||
if i == 0 {
|
||||
fmt.Fprintf(w, "%s htmlEncoding = iota\n", name)
|
||||
} else {
|
||||
fmt.Fprintf(w, "%s\n", name)
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(w, "numEncodings")
|
||||
fmt.Fprint(w, ")\n\n")
|
||||
|
||||
fmt.Fprintln(w, "var canonical = [numEncodings]string{")
|
||||
for _, g := range groups {
|
||||
for _, e := range g.Encodings {
|
||||
fmt.Fprintf(w, "%q,\n", strings.ToLower(e.Name))
|
||||
}
|
||||
}
|
||||
fmt.Fprint(w, "}\n\n")
|
||||
|
||||
fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{")
|
||||
for _, g := range groups {
|
||||
for _, e := range g.Encodings {
|
||||
for _, l := range e.Labels {
|
||||
key := strings.ToLower(e.Name)
|
||||
name := consts[key]
|
||||
fmt.Fprintf(w, "%q: %s,\n", l, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Fprint(w, "}\n\n")
|
||||
|
||||
var tags []string
|
||||
fmt.Fprintln(w, "var localeMap = []htmlEncoding{")
|
||||
for _, loc := range locales {
|
||||
tags = append(tags, loc.tag)
|
||||
fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag)
|
||||
}
|
||||
fmt.Fprint(w, "}\n\n")
|
||||
|
||||
fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " "))
|
||||
|
||||
gen.WriteGoFile("tables.go", "htmlindex", w.Bytes())
|
||||
}
|
||||
|
||||
// consts maps canonical encoding name to internal constant.
|
||||
var consts = map[string]string{
|
||||
"utf-8": "utf8",
|
||||
"ibm866": "ibm866",
|
||||
"iso-8859-2": "iso8859_2",
|
||||
"iso-8859-3": "iso8859_3",
|
||||
"iso-8859-4": "iso8859_4",
|
||||
"iso-8859-5": "iso8859_5",
|
||||
"iso-8859-6": "iso8859_6",
|
||||
"iso-8859-7": "iso8859_7",
|
||||
"iso-8859-8": "iso8859_8",
|
||||
"iso-8859-8-i": "iso8859_8I",
|
||||
"iso-8859-10": "iso8859_10",
|
||||
"iso-8859-13": "iso8859_13",
|
||||
"iso-8859-14": "iso8859_14",
|
||||
"iso-8859-15": "iso8859_15",
|
||||
"iso-8859-16": "iso8859_16",
|
||||
"koi8-r": "koi8r",
|
||||
"koi8-u": "koi8u",
|
||||
"macintosh": "macintosh",
|
||||
"windows-874": "windows874",
|
||||
"windows-1250": "windows1250",
|
||||
"windows-1251": "windows1251",
|
||||
"windows-1252": "windows1252",
|
||||
"windows-1253": "windows1253",
|
||||
"windows-1254": "windows1254",
|
||||
"windows-1255": "windows1255",
|
||||
"windows-1256": "windows1256",
|
||||
"windows-1257": "windows1257",
|
||||
"windows-1258": "windows1258",
|
||||
"x-mac-cyrillic": "macintoshCyrillic",
|
||||
"gbk": "gbk",
|
||||
"gb18030": "gb18030",
|
||||
// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
|
||||
"big5": "big5",
|
||||
"euc-jp": "eucjp",
|
||||
"iso-2022-jp": "iso2022jp",
|
||||
"shift_jis": "shiftJIS",
|
||||
"euc-kr": "euckr",
|
||||
"replacement": "replacement",
|
||||
"utf-16be": "utf16be",
|
||||
"utf-16le": "utf16le",
|
||||
"x-user-defined": "xUserDefined",
|
||||
}
|
||||
|
||||
// locales is taken from
|
||||
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
|
||||
var locales = []struct{ tag, name string }{
|
||||
{"und", "windows-1252"}, // The default value.
|
||||
{"ar", "windows-1256"},
|
||||
{"ba", "windows-1251"},
|
||||
{"be", "windows-1251"},
|
||||
{"bg", "windows-1251"},
|
||||
{"cs", "windows-1250"},
|
||||
{"el", "iso-8859-7"},
|
||||
{"et", "windows-1257"},
|
||||
{"fa", "windows-1256"},
|
||||
{"he", "windows-1255"},
|
||||
{"hr", "windows-1250"},
|
||||
{"hu", "iso-8859-2"},
|
||||
{"ja", "shift_jis"},
|
||||
{"kk", "windows-1251"},
|
||||
{"ko", "euc-kr"},
|
||||
{"ku", "windows-1254"},
|
||||
{"ky", "windows-1251"},
|
||||
{"lt", "windows-1257"},
|
||||
{"lv", "windows-1257"},
|
||||
{"mk", "windows-1251"},
|
||||
{"pl", "iso-8859-2"},
|
||||
{"ru", "windows-1251"},
|
||||
{"sah", "windows-1251"},
|
||||
{"sk", "windows-1250"},
|
||||
{"sl", "iso-8859-2"},
|
||||
{"sr", "windows-1251"},
|
||||
{"tg", "windows-1251"},
|
||||
{"th", "windows-874"},
|
||||
{"tr", "windows-1254"},
|
||||
{"tt", "windows-1251"},
|
||||
{"uk", "windows-1251"},
|
||||
{"vi", "windows-1258"},
|
||||
{"zh-hans", "gb18030"},
|
||||
{"zh-hant", "big5"},
|
||||
}
|
||||
+86
@@ -0,0 +1,86 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go
|
||||
|
||||
// Package htmlindex maps character set encoding names to Encodings as
|
||||
// recommended by the W3C for use in HTML 5. See http://www.w3.org/TR/encoding.
|
||||
package htmlindex
|
||||
|
||||
// TODO: perhaps have a "bare" version of the index (used by this package) that
|
||||
// is not pre-loaded with all encodings. Global variables in encodings prevent
|
||||
// the linker from being able to purge unneeded tables. This means that
|
||||
// referencing all encodings, as this package does for the default index, links
|
||||
// in all encodings unconditionally.
|
||||
//
|
||||
// This issue can be solved by either solving the linking issue (see
|
||||
// https://github.com/golang/go/issues/6330) or refactoring the encoding tables
|
||||
// (e.g. moving the tables to internal packages that do not use global
|
||||
// variables).
|
||||
|
||||
// TODO: allow canonicalizing names
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/language"
|
||||
)
|
||||
|
||||
var (
|
||||
errInvalidName = errors.New("htmlindex: invalid encoding name")
|
||||
errUnknown = errors.New("htmlindex: unknown Encoding")
|
||||
errUnsupported = errors.New("htmlindex: this encoding is not supported")
|
||||
)
|
||||
|
||||
var (
|
||||
matcherOnce sync.Once
|
||||
matcher language.Matcher
|
||||
)
|
||||
|
||||
// LanguageDefault returns the canonical name of the default encoding for a
|
||||
// given language.
|
||||
func LanguageDefault(tag language.Tag) string {
|
||||
matcherOnce.Do(func() {
|
||||
tags := []language.Tag{}
|
||||
for _, t := range strings.Split(locales, " ") {
|
||||
tags = append(tags, language.MustParse(t))
|
||||
}
|
||||
matcher = language.NewMatcher(tags)
|
||||
})
|
||||
_, i, _ := matcher.Match(tag)
|
||||
return canonical[localeMap[i]] // Default is Windows-1252.
|
||||
}
|
||||
|
||||
// Get returns an Encoding for one of the names listed in
|
||||
// http://www.w3.org/TR/encoding using the Default Index. Matching is case-
|
||||
// insensitive.
|
||||
func Get(name string) (encoding.Encoding, error) {
|
||||
x, ok := nameMap[strings.ToLower(strings.TrimSpace(name))]
|
||||
if !ok {
|
||||
return nil, errInvalidName
|
||||
}
|
||||
return encodings[x], nil
|
||||
}
|
||||
|
||||
// Name reports the canonical name of the given Encoding. It will return
|
||||
// an error if e is not associated with a supported encoding scheme.
|
||||
func Name(e encoding.Encoding) (string, error) {
|
||||
id, ok := e.(identifier.Interface)
|
||||
if !ok {
|
||||
return "", errUnknown
|
||||
}
|
||||
mib, _ := id.ID()
|
||||
if mib == 0 {
|
||||
return "", errUnknown
|
||||
}
|
||||
v, ok := mibMap[mib]
|
||||
if !ok {
|
||||
return "", errUnsupported
|
||||
}
|
||||
return canonical[v], nil
|
||||
}
|
||||
+105
@@ -0,0 +1,105 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package htmlindex
|
||||
|
||||
import (
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/encoding/japanese"
|
||||
"golang.org/x/text/encoding/korean"
|
||||
"golang.org/x/text/encoding/simplifiedchinese"
|
||||
"golang.org/x/text/encoding/traditionalchinese"
|
||||
"golang.org/x/text/encoding/unicode"
|
||||
)
|
||||
|
||||
// mibMap maps a MIB identifier to an htmlEncoding index.
|
||||
var mibMap = map[identifier.MIB]htmlEncoding{
|
||||
identifier.UTF8: utf8,
|
||||
identifier.UTF16BE: utf16be,
|
||||
identifier.UTF16LE: utf16le,
|
||||
identifier.IBM866: ibm866,
|
||||
identifier.ISOLatin2: iso8859_2,
|
||||
identifier.ISOLatin3: iso8859_3,
|
||||
identifier.ISOLatin4: iso8859_4,
|
||||
identifier.ISOLatinCyrillic: iso8859_5,
|
||||
identifier.ISOLatinArabic: iso8859_6,
|
||||
identifier.ISOLatinGreek: iso8859_7,
|
||||
identifier.ISOLatinHebrew: iso8859_8,
|
||||
identifier.ISO88598I: iso8859_8I,
|
||||
identifier.ISOLatin6: iso8859_10,
|
||||
identifier.ISO885913: iso8859_13,
|
||||
identifier.ISO885914: iso8859_14,
|
||||
identifier.ISO885915: iso8859_15,
|
||||
identifier.ISO885916: iso8859_16,
|
||||
identifier.KOI8R: koi8r,
|
||||
identifier.KOI8U: koi8u,
|
||||
identifier.Macintosh: macintosh,
|
||||
identifier.MacintoshCyrillic: macintoshCyrillic,
|
||||
identifier.Windows874: windows874,
|
||||
identifier.Windows1250: windows1250,
|
||||
identifier.Windows1251: windows1251,
|
||||
identifier.Windows1252: windows1252,
|
||||
identifier.Windows1253: windows1253,
|
||||
identifier.Windows1254: windows1254,
|
||||
identifier.Windows1255: windows1255,
|
||||
identifier.Windows1256: windows1256,
|
||||
identifier.Windows1257: windows1257,
|
||||
identifier.Windows1258: windows1258,
|
||||
identifier.XUserDefined: xUserDefined,
|
||||
identifier.GBK: gbk,
|
||||
identifier.GB18030: gb18030,
|
||||
identifier.Big5: big5,
|
||||
identifier.EUCPkdFmtJapanese: eucjp,
|
||||
identifier.ISO2022JP: iso2022jp,
|
||||
identifier.ShiftJIS: shiftJIS,
|
||||
identifier.EUCKR: euckr,
|
||||
identifier.Replacement: replacement,
|
||||
}
|
||||
|
||||
// encodings maps the internal htmlEncoding to an Encoding.
|
||||
// TODO: consider using a reusable index in encoding/internal.
|
||||
var encodings = [numEncodings]encoding.Encoding{
|
||||
utf8: unicode.UTF8,
|
||||
ibm866: charmap.CodePage866,
|
||||
iso8859_2: charmap.ISO8859_2,
|
||||
iso8859_3: charmap.ISO8859_3,
|
||||
iso8859_4: charmap.ISO8859_4,
|
||||
iso8859_5: charmap.ISO8859_5,
|
||||
iso8859_6: charmap.ISO8859_6,
|
||||
iso8859_7: charmap.ISO8859_7,
|
||||
iso8859_8: charmap.ISO8859_8,
|
||||
iso8859_8I: charmap.ISO8859_8I,
|
||||
iso8859_10: charmap.ISO8859_10,
|
||||
iso8859_13: charmap.ISO8859_13,
|
||||
iso8859_14: charmap.ISO8859_14,
|
||||
iso8859_15: charmap.ISO8859_15,
|
||||
iso8859_16: charmap.ISO8859_16,
|
||||
koi8r: charmap.KOI8R,
|
||||
koi8u: charmap.KOI8U,
|
||||
macintosh: charmap.Macintosh,
|
||||
windows874: charmap.Windows874,
|
||||
windows1250: charmap.Windows1250,
|
||||
windows1251: charmap.Windows1251,
|
||||
windows1252: charmap.Windows1252,
|
||||
windows1253: charmap.Windows1253,
|
||||
windows1254: charmap.Windows1254,
|
||||
windows1255: charmap.Windows1255,
|
||||
windows1256: charmap.Windows1256,
|
||||
windows1257: charmap.Windows1257,
|
||||
windows1258: charmap.Windows1258,
|
||||
macintoshCyrillic: charmap.MacintoshCyrillic,
|
||||
gbk: simplifiedchinese.GBK,
|
||||
gb18030: simplifiedchinese.GB18030,
|
||||
big5: traditionalchinese.Big5,
|
||||
eucjp: japanese.EUCJP,
|
||||
iso2022jp: japanese.ISO2022JP,
|
||||
shiftJIS: japanese.ShiftJIS,
|
||||
euckr: korean.EUCKR,
|
||||
replacement: encoding.Replacement,
|
||||
utf16be: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
|
||||
utf16le: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
|
||||
xUserDefined: charmap.XUserDefined,
|
||||
}
|
||||
+352
@@ -0,0 +1,352 @@
|
||||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package htmlindex
|
||||
|
||||
type htmlEncoding byte
|
||||
|
||||
const (
|
||||
utf8 htmlEncoding = iota
|
||||
ibm866
|
||||
iso8859_2
|
||||
iso8859_3
|
||||
iso8859_4
|
||||
iso8859_5
|
||||
iso8859_6
|
||||
iso8859_7
|
||||
iso8859_8
|
||||
iso8859_8I
|
||||
iso8859_10
|
||||
iso8859_13
|
||||
iso8859_14
|
||||
iso8859_15
|
||||
iso8859_16
|
||||
koi8r
|
||||
koi8u
|
||||
macintosh
|
||||
windows874
|
||||
windows1250
|
||||
windows1251
|
||||
windows1252
|
||||
windows1253
|
||||
windows1254
|
||||
windows1255
|
||||
windows1256
|
||||
windows1257
|
||||
windows1258
|
||||
macintoshCyrillic
|
||||
gbk
|
||||
gb18030
|
||||
big5
|
||||
eucjp
|
||||
iso2022jp
|
||||
shiftJIS
|
||||
euckr
|
||||
replacement
|
||||
utf16be
|
||||
utf16le
|
||||
xUserDefined
|
||||
numEncodings
|
||||
)
|
||||
|
||||
var canonical = [numEncodings]string{
|
||||
"utf-8",
|
||||
"ibm866",
|
||||
"iso-8859-2",
|
||||
"iso-8859-3",
|
||||
"iso-8859-4",
|
||||
"iso-8859-5",
|
||||
"iso-8859-6",
|
||||
"iso-8859-7",
|
||||
"iso-8859-8",
|
||||
"iso-8859-8-i",
|
||||
"iso-8859-10",
|
||||
"iso-8859-13",
|
||||
"iso-8859-14",
|
||||
"iso-8859-15",
|
||||
"iso-8859-16",
|
||||
"koi8-r",
|
||||
"koi8-u",
|
||||
"macintosh",
|
||||
"windows-874",
|
||||
"windows-1250",
|
||||
"windows-1251",
|
||||
"windows-1252",
|
||||
"windows-1253",
|
||||
"windows-1254",
|
||||
"windows-1255",
|
||||
"windows-1256",
|
||||
"windows-1257",
|
||||
"windows-1258",
|
||||
"x-mac-cyrillic",
|
||||
"gbk",
|
||||
"gb18030",
|
||||
"big5",
|
||||
"euc-jp",
|
||||
"iso-2022-jp",
|
||||
"shift_jis",
|
||||
"euc-kr",
|
||||
"replacement",
|
||||
"utf-16be",
|
||||
"utf-16le",
|
||||
"x-user-defined",
|
||||
}
|
||||
|
||||
var nameMap = map[string]htmlEncoding{
|
||||
"unicode-1-1-utf-8": utf8,
|
||||
"utf-8": utf8,
|
||||
"utf8": utf8,
|
||||
"866": ibm866,
|
||||
"cp866": ibm866,
|
||||
"csibm866": ibm866,
|
||||
"ibm866": ibm866,
|
||||
"csisolatin2": iso8859_2,
|
||||
"iso-8859-2": iso8859_2,
|
||||
"iso-ir-101": iso8859_2,
|
||||
"iso8859-2": iso8859_2,
|
||||
"iso88592": iso8859_2,
|
||||
"iso_8859-2": iso8859_2,
|
||||
"iso_8859-2:1987": iso8859_2,
|
||||
"l2": iso8859_2,
|
||||
"latin2": iso8859_2,
|
||||
"csisolatin3": iso8859_3,
|
||||
"iso-8859-3": iso8859_3,
|
||||
"iso-ir-109": iso8859_3,
|
||||
"iso8859-3": iso8859_3,
|
||||
"iso88593": iso8859_3,
|
||||
"iso_8859-3": iso8859_3,
|
||||
"iso_8859-3:1988": iso8859_3,
|
||||
"l3": iso8859_3,
|
||||
"latin3": iso8859_3,
|
||||
"csisolatin4": iso8859_4,
|
||||
"iso-8859-4": iso8859_4,
|
||||
"iso-ir-110": iso8859_4,
|
||||
"iso8859-4": iso8859_4,
|
||||
"iso88594": iso8859_4,
|
||||
"iso_8859-4": iso8859_4,
|
||||
"iso_8859-4:1988": iso8859_4,
|
||||
"l4": iso8859_4,
|
||||
"latin4": iso8859_4,
|
||||
"csisolatincyrillic": iso8859_5,
|
||||
"cyrillic": iso8859_5,
|
||||
"iso-8859-5": iso8859_5,
|
||||
"iso-ir-144": iso8859_5,
|
||||
"iso8859-5": iso8859_5,
|
||||
"iso88595": iso8859_5,
|
||||
"iso_8859-5": iso8859_5,
|
||||
"iso_8859-5:1988": iso8859_5,
|
||||
"arabic": iso8859_6,
|
||||
"asmo-708": iso8859_6,
|
||||
"csiso88596e": iso8859_6,
|
||||
"csiso88596i": iso8859_6,
|
||||
"csisolatinarabic": iso8859_6,
|
||||
"ecma-114": iso8859_6,
|
||||
"iso-8859-6": iso8859_6,
|
||||
"iso-8859-6-e": iso8859_6,
|
||||
"iso-8859-6-i": iso8859_6,
|
||||
"iso-ir-127": iso8859_6,
|
||||
"iso8859-6": iso8859_6,
|
||||
"iso88596": iso8859_6,
|
||||
"iso_8859-6": iso8859_6,
|
||||
"iso_8859-6:1987": iso8859_6,
|
||||
"csisolatingreek": iso8859_7,
|
||||
"ecma-118": iso8859_7,
|
||||
"elot_928": iso8859_7,
|
||||
"greek": iso8859_7,
|
||||
"greek8": iso8859_7,
|
||||
"iso-8859-7": iso8859_7,
|
||||
"iso-ir-126": iso8859_7,
|
||||
"iso8859-7": iso8859_7,
|
||||
"iso88597": iso8859_7,
|
||||
"iso_8859-7": iso8859_7,
|
||||
"iso_8859-7:1987": iso8859_7,
|
||||
"sun_eu_greek": iso8859_7,
|
||||
"csiso88598e": iso8859_8,
|
||||
"csisolatinhebrew": iso8859_8,
|
||||
"hebrew": iso8859_8,
|
||||
"iso-8859-8": iso8859_8,
|
||||
"iso-8859-8-e": iso8859_8,
|
||||
"iso-ir-138": iso8859_8,
|
||||
"iso8859-8": iso8859_8,
|
||||
"iso88598": iso8859_8,
|
||||
"iso_8859-8": iso8859_8,
|
||||
"iso_8859-8:1988": iso8859_8,
|
||||
"visual": iso8859_8,
|
||||
"csiso88598i": iso8859_8I,
|
||||
"iso-8859-8-i": iso8859_8I,
|
||||
"logical": iso8859_8I,
|
||||
"csisolatin6": iso8859_10,
|
||||
"iso-8859-10": iso8859_10,
|
||||
"iso-ir-157": iso8859_10,
|
||||
"iso8859-10": iso8859_10,
|
||||
"iso885910": iso8859_10,
|
||||
"l6": iso8859_10,
|
||||
"latin6": iso8859_10,
|
||||
"iso-8859-13": iso8859_13,
|
||||
"iso8859-13": iso8859_13,
|
||||
"iso885913": iso8859_13,
|
||||
"iso-8859-14": iso8859_14,
|
||||
"iso8859-14": iso8859_14,
|
||||
"iso885914": iso8859_14,
|
||||
"csisolatin9": iso8859_15,
|
||||
"iso-8859-15": iso8859_15,
|
||||
"iso8859-15": iso8859_15,
|
||||
"iso885915": iso8859_15,
|
||||
"iso_8859-15": iso8859_15,
|
||||
"l9": iso8859_15,
|
||||
"iso-8859-16": iso8859_16,
|
||||
"cskoi8r": koi8r,
|
||||
"koi": koi8r,
|
||||
"koi8": koi8r,
|
||||
"koi8-r": koi8r,
|
||||
"koi8_r": koi8r,
|
||||
"koi8-ru": koi8u,
|
||||
"koi8-u": koi8u,
|
||||
"csmacintosh": macintosh,
|
||||
"mac": macintosh,
|
||||
"macintosh": macintosh,
|
||||
"x-mac-roman": macintosh,
|
||||
"dos-874": windows874,
|
||||
"iso-8859-11": windows874,
|
||||
"iso8859-11": windows874,
|
||||
"iso885911": windows874,
|
||||
"tis-620": windows874,
|
||||
"windows-874": windows874,
|
||||
"cp1250": windows1250,
|
||||
"windows-1250": windows1250,
|
||||
"x-cp1250": windows1250,
|
||||
"cp1251": windows1251,
|
||||
"windows-1251": windows1251,
|
||||
"x-cp1251": windows1251,
|
||||
"ansi_x3.4-1968": windows1252,
|
||||
"ascii": windows1252,
|
||||
"cp1252": windows1252,
|
||||
"cp819": windows1252,
|
||||
"csisolatin1": windows1252,
|
||||
"ibm819": windows1252,
|
||||
"iso-8859-1": windows1252,
|
||||
"iso-ir-100": windows1252,
|
||||
"iso8859-1": windows1252,
|
||||
"iso88591": windows1252,
|
||||
"iso_8859-1": windows1252,
|
||||
"iso_8859-1:1987": windows1252,
|
||||
"l1": windows1252,
|
||||
"latin1": windows1252,
|
||||
"us-ascii": windows1252,
|
||||
"windows-1252": windows1252,
|
||||
"x-cp1252": windows1252,
|
||||
"cp1253": windows1253,
|
||||
"windows-1253": windows1253,
|
||||
"x-cp1253": windows1253,
|
||||
"cp1254": windows1254,
|
||||
"csisolatin5": windows1254,
|
||||
"iso-8859-9": windows1254,
|
||||
"iso-ir-148": windows1254,
|
||||
"iso8859-9": windows1254,
|
||||
"iso88599": windows1254,
|
||||
"iso_8859-9": windows1254,
|
||||
"iso_8859-9:1989": windows1254,
|
||||
"l5": windows1254,
|
||||
"latin5": windows1254,
|
||||
"windows-1254": windows1254,
|
||||
"x-cp1254": windows1254,
|
||||
"cp1255": windows1255,
|
||||
"windows-1255": windows1255,
|
||||
"x-cp1255": windows1255,
|
||||
"cp1256": windows1256,
|
||||
"windows-1256": windows1256,
|
||||
"x-cp1256": windows1256,
|
||||
"cp1257": windows1257,
|
||||
"windows-1257": windows1257,
|
||||
"x-cp1257": windows1257,
|
||||
"cp1258": windows1258,
|
||||
"windows-1258": windows1258,
|
||||
"x-cp1258": windows1258,
|
||||
"x-mac-cyrillic": macintoshCyrillic,
|
||||
"x-mac-ukrainian": macintoshCyrillic,
|
||||
"chinese": gbk,
|
||||
"csgb2312": gbk,
|
||||
"csiso58gb231280": gbk,
|
||||
"gb2312": gbk,
|
||||
"gb_2312": gbk,
|
||||
"gb_2312-80": gbk,
|
||||
"gbk": gbk,
|
||||
"iso-ir-58": gbk,
|
||||
"x-gbk": gbk,
|
||||
"gb18030": gb18030,
|
||||
"big5": big5,
|
||||
"big5-hkscs": big5,
|
||||
"cn-big5": big5,
|
||||
"csbig5": big5,
|
||||
"x-x-big5": big5,
|
||||
"cseucpkdfmtjapanese": eucjp,
|
||||
"euc-jp": eucjp,
|
||||
"x-euc-jp": eucjp,
|
||||
"csiso2022jp": iso2022jp,
|
||||
"iso-2022-jp": iso2022jp,
|
||||
"csshiftjis": shiftJIS,
|
||||
"ms932": shiftJIS,
|
||||
"ms_kanji": shiftJIS,
|
||||
"shift-jis": shiftJIS,
|
||||
"shift_jis": shiftJIS,
|
||||
"sjis": shiftJIS,
|
||||
"windows-31j": shiftJIS,
|
||||
"x-sjis": shiftJIS,
|
||||
"cseuckr": euckr,
|
||||
"csksc56011987": euckr,
|
||||
"euc-kr": euckr,
|
||||
"iso-ir-149": euckr,
|
||||
"korean": euckr,
|
||||
"ks_c_5601-1987": euckr,
|
||||
"ks_c_5601-1989": euckr,
|
||||
"ksc5601": euckr,
|
||||
"ksc_5601": euckr,
|
||||
"windows-949": euckr,
|
||||
"csiso2022kr": replacement,
|
||||
"hz-gb-2312": replacement,
|
||||
"iso-2022-cn": replacement,
|
||||
"iso-2022-cn-ext": replacement,
|
||||
"iso-2022-kr": replacement,
|
||||
"utf-16be": utf16be,
|
||||
"utf-16": utf16le,
|
||||
"utf-16le": utf16le,
|
||||
"x-user-defined": xUserDefined,
|
||||
}
|
||||
|
||||
var localeMap = []htmlEncoding{
|
||||
windows1252, // und
|
||||
windows1256, // ar
|
||||
windows1251, // ba
|
||||
windows1251, // be
|
||||
windows1251, // bg
|
||||
windows1250, // cs
|
||||
iso8859_7, // el
|
||||
windows1257, // et
|
||||
windows1256, // fa
|
||||
windows1255, // he
|
||||
windows1250, // hr
|
||||
iso8859_2, // hu
|
||||
shiftJIS, // ja
|
||||
windows1251, // kk
|
||||
euckr, // ko
|
||||
windows1254, // ku
|
||||
windows1251, // ky
|
||||
windows1257, // lt
|
||||
windows1257, // lv
|
||||
windows1251, // mk
|
||||
iso8859_2, // pl
|
||||
windows1251, // ru
|
||||
windows1251, // sah
|
||||
windows1250, // sk
|
||||
iso8859_2, // sl
|
||||
windows1251, // sr
|
||||
windows1251, // tg
|
||||
windows874, // th
|
||||
windows1254, // tr
|
||||
windows1251, // tt
|
||||
windows1251, // uk
|
||||
windows1258, // vi
|
||||
gb18030, // zh-hans
|
||||
big5, // zh-hant
|
||||
}
|
||||
|
||||
const locales = "und ar ba be bg cs el et fa he hr hu ja kk ko ku ky lt lv mk pl ru sah sk sl sr tg th tr tt uk vi zh-hans zh-hant"
|
||||
+67
@@ -0,0 +1,67 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package ianaindex maps names to Encodings as specified by the IANA registry.
|
||||
// This includes both the MIME and IANA names.
|
||||
//
|
||||
// Status: this package is an incomplete API sketch, and isn't usable yet.
|
||||
//
|
||||
// See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
|
||||
// more details.
|
||||
package ianaindex
|
||||
|
||||
import (
|
||||
"golang.org/x/text/encoding"
|
||||
)
|
||||
|
||||
// TODO: remove the "Status... incomplete" in the package doc comment.
|
||||
// TODO: allow users to specify their own aliases?
|
||||
// TODO: allow users to specify their own indexes?
|
||||
// TODO: allow canonicalizing names
|
||||
|
||||
// NOTE: only use these top-level variables if we can get the linker to drop
|
||||
// the indexes when they are not used. Make them a function or perhaps only
|
||||
// support MIME otherwise.
|
||||
|
||||
var (
|
||||
// MIME is an index to map MIME names. It does not support aliases.
|
||||
MIME *Index
|
||||
|
||||
// IANA is an index that supports all names and aliases using IANA names as
|
||||
// the canonical identifier.
|
||||
IANA *Index
|
||||
)
|
||||
|
||||
// Index maps names registered by IANA to Encodings.
|
||||
type Index struct {
|
||||
}
|
||||
|
||||
// Get returns an Encoding for IANA-registered names. Matching is
|
||||
// case-insensitive.
|
||||
func (x *Index) Get(name string) (encoding.Encoding, error) {
|
||||
panic("TODO: implement")
|
||||
}
|
||||
|
||||
// Name reports the canonical name of the given Encoding. It will return an
|
||||
// error if the e is not associated with a known encoding scheme.
|
||||
func (x *Index) Name(e encoding.Encoding) (string, error) {
|
||||
panic("TODO: implement")
|
||||
}
|
||||
|
||||
// TODO: the coverage of this index is rather spotty. Allowing users to set
|
||||
// encodings would allow:
|
||||
// - users to increase coverage
|
||||
// - allow a partially loaded set of encodings in case the user doesn't need to
|
||||
// them all.
|
||||
// - write an OS-specific wrapper for supported encodings and set them.
|
||||
// The exact definition of Set depends a bit on if and how we want to let users
|
||||
// write their own Encoding implementations. Also, it is not possible yet to
|
||||
// only partially load the encodings without doing some refactoring. Until this
|
||||
// is solved, we might as well not support Set.
|
||||
// // Set sets the e to be used for the encoding scheme identified by name. Only
|
||||
// // canonical names may be used. An empty name assigns e to its internally
|
||||
// // associated encoding scheme.
|
||||
// func (x *Index) Set(name string, e encoding.Encoding) error {
|
||||
// panic("TODO: implement")
|
||||
// }
|
||||
+137
@@ -0,0 +1,137 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
)
|
||||
|
||||
type registry struct {
|
||||
XMLName xml.Name `xml:"registry"`
|
||||
Updated string `xml:"updated"`
|
||||
Registry []struct {
|
||||
ID string `xml:"id,attr"`
|
||||
Record []struct {
|
||||
Name string `xml:"name"`
|
||||
Xref []struct {
|
||||
Type string `xml:"type,attr"`
|
||||
Data string `xml:"data,attr"`
|
||||
} `xml:"xref"`
|
||||
Desc struct {
|
||||
Data string `xml:",innerxml"`
|
||||
// Any []struct {
|
||||
// Data string `xml:",chardata"`
|
||||
// } `xml:",any"`
|
||||
// Data string `xml:",chardata"`
|
||||
} `xml:"description,"`
|
||||
MIB string `xml:"value"`
|
||||
Alias []string `xml:"alias"`
|
||||
MIME string `xml:"preferred_alias"`
|
||||
} `xml:"record"`
|
||||
} `xml:"registry"`
|
||||
}
|
||||
|
||||
func main() {
|
||||
r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml")
|
||||
reg := ®istry{}
|
||||
if err := xml.NewDecoder(r).Decode(®); err != nil && err != io.EOF {
|
||||
log.Fatalf("Error decoding charset registry: %v", err)
|
||||
}
|
||||
if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" {
|
||||
log.Fatalf("Unexpected ID %s", reg.Registry[0].ID)
|
||||
}
|
||||
|
||||
w := &bytes.Buffer{}
|
||||
fmt.Fprintf(w, "const (\n")
|
||||
for _, rec := range reg.Registry[0].Record {
|
||||
constName := ""
|
||||
for _, a := range rec.Alias {
|
||||
if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 {
|
||||
// Some of the constant definitions have comments in them. Strip those.
|
||||
constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0])
|
||||
}
|
||||
}
|
||||
if constName == "" {
|
||||
switch rec.MIB {
|
||||
case "2085":
|
||||
constName = "HZGB2312" // Not listed as alias for some reason.
|
||||
default:
|
||||
log.Fatalf("No cs alias defined for %s.", rec.MIB)
|
||||
}
|
||||
}
|
||||
if rec.MIME != "" {
|
||||
rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME)
|
||||
}
|
||||
fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME)
|
||||
if len(rec.Desc.Data) > 0 {
|
||||
fmt.Fprint(w, "// ")
|
||||
d := xml.NewDecoder(strings.NewReader(rec.Desc.Data))
|
||||
inElem := true
|
||||
attr := ""
|
||||
for {
|
||||
t, err := d.Token()
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
log.Fatal(err)
|
||||
}
|
||||
break
|
||||
}
|
||||
switch x := t.(type) {
|
||||
case xml.CharData:
|
||||
attr = "" // Don't need attribute info.
|
||||
a := bytes.Split([]byte(x), []byte("\n"))
|
||||
for i, b := range a {
|
||||
if b = bytes.TrimSpace(b); len(b) != 0 {
|
||||
if !inElem && i > 0 {
|
||||
fmt.Fprint(w, "\n// ")
|
||||
}
|
||||
inElem = false
|
||||
fmt.Fprintf(w, "%s ", string(b))
|
||||
}
|
||||
}
|
||||
case xml.StartElement:
|
||||
if x.Name.Local == "xref" {
|
||||
inElem = true
|
||||
use := false
|
||||
for _, a := range x.Attr {
|
||||
if a.Name.Local == "type" {
|
||||
use = use || a.Value != "person"
|
||||
}
|
||||
if a.Name.Local == "data" && use {
|
||||
attr = a.Value + " "
|
||||
}
|
||||
}
|
||||
}
|
||||
case xml.EndElement:
|
||||
inElem = false
|
||||
fmt.Fprint(w, attr)
|
||||
}
|
||||
}
|
||||
fmt.Fprint(w, "\n")
|
||||
}
|
||||
for _, x := range rec.Xref {
|
||||
switch x.Type {
|
||||
case "rfc":
|
||||
fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data))
|
||||
case "uri":
|
||||
fmt.Fprintf(w, "// Reference: %s\n", x.Data)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB)
|
||||
fmt.Fprintln(w)
|
||||
}
|
||||
fmt.Fprintln(w, ")")
|
||||
|
||||
gen.WriteGoFile("mib.go", "identifier", w.Bytes())
|
||||
}
|
||||
+81
@@ -0,0 +1,81 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go
|
||||
|
||||
// Package identifier defines the contract between implementations of Encoding
|
||||
// and Index by defining identifiers that uniquely identify standardized coded
|
||||
// character sets (CCS) and character encoding schemes (CES), which we will
|
||||
// together refer to as encodings, for which Encoding implementations provide
|
||||
// converters to and from UTF-8. This package is typically only of concern to
|
||||
// implementers of Indexes and Encodings.
|
||||
//
|
||||
// One part of the identifier is the MIB code, which is defined by IANA and
|
||||
// uniquely identifies a CCS or CES. Each code is associated with data that
|
||||
// references authorities, official documentation as well as aliases and MIME
|
||||
// names.
|
||||
//
|
||||
// Not all CESs are covered by the IANA registry. The "other" string that is
|
||||
// returned by ID can be used to identify other character sets or versions of
|
||||
// existing ones.
|
||||
//
|
||||
// It is recommended that each package that provides a set of Encodings provide
|
||||
// the All and Common variables to reference all supported encodings and
|
||||
// commonly used subset. This allows Index implementations to include all
|
||||
// available encodings without explicitly referencing or knowing about them.
|
||||
package identifier
|
||||
|
||||
// Note: this package is internal, but could be made public if there is a need
|
||||
// for writing third-party Indexes and Encodings.
|
||||
|
||||
// References:
|
||||
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
|
||||
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
|
||||
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
|
||||
// - http://www.ietf.org/rfc/rfc2978.txt
|
||||
// - http://www.unicode.org/reports/tr22/
|
||||
// - http://www.w3.org/TR/encoding/
|
||||
// - https://encoding.spec.whatwg.org/
|
||||
// - https://encoding.spec.whatwg.org/encodings.json
|
||||
// - https://tools.ietf.org/html/rfc6657#section-5
|
||||
|
||||
// Interface can be implemented by Encodings to define the CCS or CES for which
|
||||
// it implements conversions.
|
||||
type Interface interface {
|
||||
// ID returns an encoding identifier. Exactly one of the mib and other
|
||||
// values should be non-zero.
|
||||
//
|
||||
// In the usual case it is only necessary to indicate the MIB code. The
|
||||
// other string can be used to specify encodings for which there is no MIB,
|
||||
// such as "x-mac-dingbat".
|
||||
//
|
||||
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
|
||||
ID() (mib MIB, other string)
|
||||
|
||||
// NOTE: the restrictions on the encoding are to allow extending the syntax
|
||||
// with additional information such as versions, vendors and other variants.
|
||||
}
|
||||
|
||||
// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds
|
||||
// some identifiers for some encodings that are not covered by the IANA
|
||||
// standard.
|
||||
//
|
||||
// See http://www.iana.org/assignments/ianacharset-mib.
|
||||
type MIB uint16
|
||||
|
||||
// These additional MIB types are not defined in IANA. They are added because
|
||||
// they are common and defined within the text repo.
|
||||
const (
|
||||
// Unofficial marks the start of encodings not registered by IANA.
|
||||
Unofficial MIB = 10000 + iota
|
||||
|
||||
// Replacement is the WhatWG replacement encoding.
|
||||
Replacement
|
||||
|
||||
// XUserDefined is the code for x-user-defined.
|
||||
XUserDefined
|
||||
|
||||
// MacintoshCyrillic is the code for x-mac-cyrillic.
|
||||
MacintoshCyrillic
|
||||
)
|
||||
+1621
File diff suppressed because it is too large
Load Diff
+75
@@ -0,0 +1,75 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package internal contains code that is shared among encoding implementations.
|
||||
package internal
|
||||
|
||||
import (
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// Encoding is an implementation of the Encoding interface that adds the String
|
||||
// and ID methods to an existing encoding.
|
||||
type Encoding struct {
|
||||
encoding.Encoding
|
||||
Name string
|
||||
MIB identifier.MIB
|
||||
}
|
||||
|
||||
// _ verifies that Encoding implements identifier.Interface.
|
||||
var _ identifier.Interface = (*Encoding)(nil)
|
||||
|
||||
func (e *Encoding) String() string {
|
||||
return e.Name
|
||||
}
|
||||
|
||||
func (e *Encoding) ID() (mib identifier.MIB, other string) {
|
||||
return e.MIB, ""
|
||||
}
|
||||
|
||||
// SimpleEncoding is an Encoding that combines two Transformers.
|
||||
type SimpleEncoding struct {
|
||||
Decoder transform.Transformer
|
||||
Encoder transform.Transformer
|
||||
}
|
||||
|
||||
func (e *SimpleEncoding) NewDecoder() *encoding.Decoder {
|
||||
return &encoding.Decoder{Transformer: e.Decoder}
|
||||
}
|
||||
|
||||
func (e *SimpleEncoding) NewEncoder() *encoding.Encoder {
|
||||
return &encoding.Encoder{Transformer: e.Encoder}
|
||||
}
|
||||
|
||||
// FuncEncoding is an Encoding that combines two functions returning a new
|
||||
// Transformer.
|
||||
type FuncEncoding struct {
|
||||
Decoder func() transform.Transformer
|
||||
Encoder func() transform.Transformer
|
||||
}
|
||||
|
||||
func (e FuncEncoding) NewDecoder() *encoding.Decoder {
|
||||
return &encoding.Decoder{Transformer: e.Decoder()}
|
||||
}
|
||||
|
||||
func (e FuncEncoding) NewEncoder() *encoding.Encoder {
|
||||
return &encoding.Encoder{Transformer: e.Encoder()}
|
||||
}
|
||||
|
||||
// A RepertoireError indicates a rune is not in the repertoire of a destination
|
||||
// encoding. It is associated with an encoding-specific suggested replacement
|
||||
// byte.
|
||||
type RepertoireError byte
|
||||
|
||||
// Error implements the error interrface.
|
||||
func (r RepertoireError) Error() string {
|
||||
return "encoding: rune not supported by encoding."
|
||||
}
|
||||
|
||||
// Replacement returns the replacement string associated with this error.
|
||||
func (r RepertoireError) Replacement() byte { return byte(r) }
|
||||
|
||||
var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub)
|
||||
+12
@@ -0,0 +1,12 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"golang.org/x/text/encoding"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{EUCJP, ISO2022JP, ShiftJIS}
|
||||
+211
@@ -0,0 +1,211 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// EUCJP is the EUC-JP encoding.
|
||||
var EUCJP encoding.Encoding = &eucJP
|
||||
|
||||
var eucJP = internal.Encoding{
|
||||
&internal.SimpleEncoding{eucJPDecoder{}, eucJPEncoder{}},
|
||||
"EUC-JP",
|
||||
identifier.EUCPkdFmtJapanese,
|
||||
}
|
||||
|
||||
var errInvalidEUCJP = errors.New("japanese: invalid EUC-JP encoding")
|
||||
|
||||
type eucJPDecoder struct{ transform.NopResetter }
|
||||
|
||||
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case c0 == 0x8e:
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
if c1 < 0xa1 || 0xdf < c1 {
|
||||
err = errInvalidEUCJP
|
||||
break loop
|
||||
}
|
||||
r, size = rune(c1)+(0xff61-0xa1), 2
|
||||
|
||||
case c0 == 0x8f:
|
||||
if nSrc+2 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
if c1 < 0xa1 || 0xfe < c1 {
|
||||
err = errInvalidEUCJP
|
||||
break loop
|
||||
}
|
||||
c2 := src[nSrc+2]
|
||||
if c2 < 0xa1 || 0xfe < c2 {
|
||||
err = errInvalidEUCJP
|
||||
break loop
|
||||
}
|
||||
r, size = '\ufffd', 3
|
||||
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) {
|
||||
r = rune(jis0212Decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
case 0xa1 <= c0 && c0 <= 0xfe:
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
if c1 < 0xa1 || 0xfe < c1 {
|
||||
err = errInvalidEUCJP
|
||||
break loop
|
||||
}
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) {
|
||||
r = rune(jis0208Decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
err = errInvalidEUCJP
|
||||
break loop
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
if atEOF && err == transform.ErrShortSrc {
|
||||
err = errInvalidEUCJP
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type eucJPEncoder struct{ transform.NopResetter }
|
||||
|
||||
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
goto write2
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2or3
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2or3:
|
||||
if r>>tableShift == jis0208 {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
} else {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = 0x8f
|
||||
nDst++
|
||||
}
|
||||
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
|
||||
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
|
||||
nDst += 2
|
||||
continue
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = 0x8e
|
||||
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 6 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
+296
@@ -0,0 +1,296 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// ISO2022JP is the ISO-2022-JP encoding.
|
||||
var ISO2022JP encoding.Encoding = &iso2022JP
|
||||
|
||||
var iso2022JP = internal.Encoding{
|
||||
internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
|
||||
"ISO-2022-JP",
|
||||
identifier.ISO2022JP,
|
||||
}
|
||||
|
||||
func iso2022JPNewDecoder() transform.Transformer {
|
||||
return new(iso2022JPDecoder)
|
||||
}
|
||||
|
||||
func iso2022JPNewEncoder() transform.Transformer {
|
||||
return new(iso2022JPEncoder)
|
||||
}
|
||||
|
||||
var errInvalidISO2022JP = errors.New("japanese: invalid ISO-2022-JP encoding")
|
||||
|
||||
const (
|
||||
asciiState = iota
|
||||
katakanaState
|
||||
jis0208State
|
||||
jis0212State
|
||||
)
|
||||
|
||||
const asciiEsc = 0x1b
|
||||
|
||||
type iso2022JPDecoder int
|
||||
|
||||
func (d *iso2022JPDecoder) Reset() {
|
||||
*d = asciiState
|
||||
}
|
||||
|
||||
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
c0 := src[nSrc]
|
||||
if c0 >= utf8.RuneSelf {
|
||||
err = errInvalidISO2022JP
|
||||
break loop
|
||||
}
|
||||
|
||||
if c0 == asciiEsc {
|
||||
if nSrc+2 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
size = 3
|
||||
c1 := src[nSrc+1]
|
||||
c2 := src[nSrc+2]
|
||||
switch {
|
||||
case c1 == '$' && (c2 == '@' || c2 == 'B'):
|
||||
*d = jis0208State
|
||||
continue
|
||||
case c1 == '$' && c2 == '(':
|
||||
if nSrc+3 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
size = 4
|
||||
if src[nSrc]+3 == 'D' {
|
||||
*d = jis0212State
|
||||
continue
|
||||
}
|
||||
case c1 == '(' && (c2 == 'B' || c2 == 'J'):
|
||||
*d = asciiState
|
||||
continue
|
||||
case c1 == '(' && c2 == 'I':
|
||||
*d = katakanaState
|
||||
continue
|
||||
}
|
||||
err = errInvalidISO2022JP
|
||||
break loop
|
||||
}
|
||||
|
||||
switch *d {
|
||||
case asciiState:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case katakanaState:
|
||||
if c0 < 0x21 || 0x60 <= c0 {
|
||||
err = errInvalidISO2022JP
|
||||
break loop
|
||||
}
|
||||
r, size = rune(c0)+(0xff61-0x21), 1
|
||||
|
||||
default:
|
||||
if c0 == 0x0a {
|
||||
*d = asciiState
|
||||
r, size = rune(c0), 1
|
||||
break
|
||||
}
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
size = 2
|
||||
c1 := src[nSrc+1]
|
||||
i := int(c0-0x21)*94 + int(c1-0x21)
|
||||
if *d == jis0208State && i < len(jis0208Decode) {
|
||||
r = rune(jis0208Decode[i])
|
||||
} else if *d == jis0212State && i < len(jis0212Decode) {
|
||||
r = rune(jis0212Decode[i])
|
||||
} else {
|
||||
r = '\ufffd'
|
||||
break
|
||||
}
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
if atEOF && err == transform.ErrShortSrc {
|
||||
err = errInvalidISO2022JP
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type iso2022JPEncoder int
|
||||
|
||||
func (e *iso2022JPEncoder) Reset() {
|
||||
*e = asciiState
|
||||
}
|
||||
|
||||
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
//
|
||||
// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
|
||||
// is not used by the iso-2022-jp encoder due to lack of widespread support".
|
||||
//
|
||||
// TODO: do we have to special-case U+00A5 and U+203E, as per
|
||||
// http://encoding.spec.whatwg.org/#iso-2022-jp
|
||||
// Doing so would mean that "\u00a5" would not be preserved
|
||||
// after an encode-decode round trip.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
goto writeKatakana
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
|
||||
goto writeJIS
|
||||
}
|
||||
}
|
||||
|
||||
// Switch back to ASCII state in case of error so that an ASCII
|
||||
// replacement character can be written in the correct state.
|
||||
if *e != asciiState {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = asciiState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
if *e != asciiState {
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = asciiState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
} else if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
writeJIS:
|
||||
if *e != jis0208State {
|
||||
if nDst+5 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = jis0208State
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '$'
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
} else if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
|
||||
dst[nDst+1] = 0x21 + uint8(r)&codeMask
|
||||
nDst += 2
|
||||
continue
|
||||
|
||||
writeKatakana:
|
||||
if *e != katakanaState {
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = katakanaState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'I'
|
||||
nDst += 3
|
||||
} else if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r - (0xff61 - 0x21))
|
||||
nDst++
|
||||
continue
|
||||
}
|
||||
if atEOF && err == nil && *e != asciiState {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
} else {
|
||||
*e = asciiState
|
||||
dst[nDst+0] = asciiEsc
|
||||
dst[nDst+1] = '('
|
||||
dst[nDst+2] = 'B'
|
||||
nDst += 3
|
||||
}
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+161
@@ -0,0 +1,161 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
// TODO: Emoji extensions?
|
||||
// http://www.unicode.org/faq/emoji_dingbats.html
|
||||
// http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type entry struct {
|
||||
jisCode, table int
|
||||
}
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n")
|
||||
fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n")
|
||||
|
||||
reverse := [65536]entry{}
|
||||
for i := range reverse {
|
||||
reverse[i].table = -1
|
||||
}
|
||||
|
||||
tables := []struct {
|
||||
url string
|
||||
name string
|
||||
}{
|
||||
{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"},
|
||||
{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"},
|
||||
}
|
||||
for i, table := range tables {
|
||||
res, err := http.Get(table.url)
|
||||
if err != nil {
|
||||
log.Fatalf("%q: Get: %v", table.url, err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := 0, uint16(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("%q: could not parse %q", table.url, s)
|
||||
}
|
||||
if x < 0 || 120*94 <= x {
|
||||
log.Fatalf("%q: JIS code %d is out of range", table.url, x)
|
||||
}
|
||||
mapping[x] = y
|
||||
if reverse[y].table == -1 {
|
||||
reverse[y] = entry{jisCode: x, table: i}
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("%q: scanner error: %v", table.url, err)
|
||||
}
|
||||
|
||||
fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n",
|
||||
table.name, table.name, table.url)
|
||||
fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name)
|
||||
for i, m := range mapping {
|
||||
if m != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, m)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v.table == -1 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const (\n")
|
||||
fmt.Printf("\tjis0208 = 1\n")
|
||||
fmt.Printf("\tjis0212 = 2\n")
|
||||
fmt.Printf("\tcodeMask = 0x7f\n")
|
||||
fmt.Printf("\tcodeShift = 7\n")
|
||||
fmt.Printf("\ttableShift = 14\n")
|
||||
fmt.Printf(")\n\n")
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("//\n")
|
||||
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
|
||||
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
|
||||
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
|
||||
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x.table == -1 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
|
||||
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
+189
@@ -0,0 +1,189 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
|
||||
// Windows-31J.
|
||||
var ShiftJIS encoding.Encoding = &shiftJIS
|
||||
|
||||
var shiftJIS = internal.Encoding{
|
||||
&internal.SimpleEncoding{shiftJISDecoder{}, shiftJISEncoder{}},
|
||||
"Shift JIS",
|
||||
identifier.ShiftJIS,
|
||||
}
|
||||
|
||||
var errInvalidShiftJIS = errors.New("japanese: invalid Shift JIS encoding")
|
||||
|
||||
type shiftJISDecoder struct{ transform.NopResetter }
|
||||
|
||||
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case 0xa1 <= c0 && c0 < 0xe0:
|
||||
r, size = rune(c0)+(0xff61-0xa1), 1
|
||||
|
||||
case (0x81 <= c0 && c0 < 0xa0) || (0xe0 <= c0 && c0 < 0xfd):
|
||||
if c0 <= 0x9f {
|
||||
c0 -= 0x70
|
||||
} else {
|
||||
c0 -= 0xb0
|
||||
}
|
||||
c0 = 2*c0 - 0x21
|
||||
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
switch {
|
||||
case c1 < 0x40:
|
||||
err = errInvalidShiftJIS
|
||||
break loop
|
||||
case c1 < 0x7f:
|
||||
c0--
|
||||
c1 -= 0x40
|
||||
case c1 == 0x7f:
|
||||
err = errInvalidShiftJIS
|
||||
break loop
|
||||
case c1 < 0x9f:
|
||||
c0--
|
||||
c1 -= 0x41
|
||||
case c1 < 0xfd:
|
||||
c1 -= 0x9f
|
||||
default:
|
||||
err = errInvalidShiftJIS
|
||||
break loop
|
||||
}
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) {
|
||||
r = rune(jis0208Decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
err = errInvalidShiftJIS
|
||||
break loop
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
if atEOF && err == transform.ErrShortSrc {
|
||||
err = errInvalidShiftJIS
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type shiftJISEncoder struct{ transform.NopResetter }
|
||||
|
||||
func (shiftJISEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if 0xff61 <= r && r < 0xffa0 {
|
||||
r -= 0xff61 - 0xa1
|
||||
goto write1
|
||||
}
|
||||
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write1:
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2:
|
||||
j1 := uint8(r>>codeShift) & codeMask
|
||||
j2 := uint8(r) & codeMask
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
if j1 <= 61 {
|
||||
dst[nDst+0] = 129 + j1/2
|
||||
} else {
|
||||
dst[nDst+0] = 193 + j1/2
|
||||
}
|
||||
if j1&1 == 0 {
|
||||
dst[nDst+1] = j2 + j2/63 + 64
|
||||
} else {
|
||||
dst[nDst+1] = j2 + 159
|
||||
}
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+26971
File diff suppressed because it is too large
Load Diff
+178
@@ -0,0 +1,178 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package korean
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{EUCKR}
|
||||
|
||||
// EUCKR is the EUC-KR encoding, also known as Code Page 949.
|
||||
var EUCKR encoding.Encoding = &eucKR
|
||||
|
||||
var eucKR = internal.Encoding{
|
||||
&internal.SimpleEncoding{eucKRDecoder{}, eucKREncoder{}},
|
||||
"EUC-KR",
|
||||
identifier.EUCKR,
|
||||
}
|
||||
|
||||
var errInvalidEUCKR = errors.New("korean: invalid EUC-KR encoding")
|
||||
|
||||
type eucKRDecoder struct{ transform.NopResetter }
|
||||
|
||||
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case 0x81 <= c0 && c0 < 0xff:
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
if c0 < 0xc7 {
|
||||
r = 178 * rune(c0-0x81)
|
||||
switch {
|
||||
case 0x41 <= c1 && c1 < 0x5b:
|
||||
r += rune(c1) - (0x41 - 0*26)
|
||||
case 0x61 <= c1 && c1 < 0x7b:
|
||||
r += rune(c1) - (0x61 - 1*26)
|
||||
case 0x81 <= c1 && c1 < 0xff:
|
||||
r += rune(c1) - (0x81 - 2*26)
|
||||
default:
|
||||
err = errInvalidEUCKR
|
||||
break loop
|
||||
}
|
||||
} else if 0xa1 <= c1 && c1 < 0xff {
|
||||
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1)
|
||||
} else {
|
||||
err = errInvalidEUCKR
|
||||
break loop
|
||||
}
|
||||
if int(r) < len(decode) {
|
||||
r = rune(decode[r])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
} else {
|
||||
r = '\ufffd'
|
||||
}
|
||||
size = 2
|
||||
|
||||
default:
|
||||
err = errInvalidEUCKR
|
||||
break loop
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
if atEOF && err == transform.ErrShortSrc {
|
||||
err = errInvalidEUCKR
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type eucKREncoder struct{ transform.NopResetter }
|
||||
|
||||
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode6Low <= r && r < encode6High:
|
||||
if r = rune(encode6[r-encode6Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r >> 8)
|
||||
dst[nDst+1] = uint8(r)
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 7 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
+143
@@ -0,0 +1,143 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n")
|
||||
fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n")
|
||||
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint16{}
|
||||
reverse := [65536]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint16(0), uint16(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x {
|
||||
log.Fatalf("EUC-KR code %d is out of range", x)
|
||||
}
|
||||
mapping[x] = y
|
||||
if reverse[y] == 0 {
|
||||
c0, c1 := uint16(0), uint16(0)
|
||||
if x < 178*(0xc7-0x81) {
|
||||
c0 = uint16(x/178) + 0x81
|
||||
c1 = uint16(x % 178)
|
||||
switch {
|
||||
case c1 < 1*26:
|
||||
c1 += 0x41
|
||||
case c1 < 2*26:
|
||||
c1 += 0x47
|
||||
default:
|
||||
c1 += 0x4d
|
||||
}
|
||||
} else {
|
||||
x -= 178 * (0xc7 - 0x81)
|
||||
c0 = uint16(x/94) + 0xc7
|
||||
c1 = uint16(x%94) + 0xa1
|
||||
}
|
||||
reverse[y] = c0<<8 | c1
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
|
||||
fmt.Printf("var decode = [...]uint16{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, v)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v == 0 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
+34152
File diff suppressed because it is too large
Load Diff
+12
@@ -0,0 +1,12 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese
|
||||
|
||||
import (
|
||||
"golang.org/x/text/encoding"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{GB18030, GBK, HZGB2312}
|
||||
+281
@@ -0,0 +1,281 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
var (
|
||||
// GB18030 is the GB18030 encoding.
|
||||
GB18030 encoding.Encoding = &gbk18030
|
||||
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
|
||||
// and is also known as Code Page 936.
|
||||
GBK encoding.Encoding = &gbk
|
||||
)
|
||||
|
||||
var gbk = internal.Encoding{
|
||||
&internal.SimpleEncoding{
|
||||
gbkDecoder{gb18030: false},
|
||||
gbkEncoder{gb18030: false},
|
||||
},
|
||||
"GBK",
|
||||
identifier.GBK,
|
||||
}
|
||||
|
||||
var gbk18030 = internal.Encoding{
|
||||
&internal.SimpleEncoding{
|
||||
gbkDecoder{gb18030: true},
|
||||
gbkEncoder{gb18030: true},
|
||||
},
|
||||
"GB18030",
|
||||
identifier.GB18030,
|
||||
}
|
||||
|
||||
var (
|
||||
errInvalidGB18030 = errors.New("simplifiedchinese: invalid GB18030 encoding")
|
||||
errInvalidGBK = errors.New("simplifiedchinese: invalid GBK encoding")
|
||||
)
|
||||
|
||||
type gbkDecoder struct {
|
||||
transform.NopResetter
|
||||
gb18030 bool
|
||||
}
|
||||
|
||||
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||
// says to treat "gbk" as Code Page 936.
|
||||
case c0 == 0x80:
|
||||
r, size = '€', 1
|
||||
|
||||
case c0 < 0xff:
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
switch {
|
||||
case 0x40 <= c1 && c1 < 0x7f:
|
||||
c1 -= 0x40
|
||||
case 0x80 <= c1 && c1 < 0xff:
|
||||
c1 -= 0x41
|
||||
case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
|
||||
if nSrc+3 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c2 := src[nSrc+2]
|
||||
if c2 < 0x81 || 0xff <= c2 {
|
||||
err = errInvalidGB18030
|
||||
break loop
|
||||
}
|
||||
c3 := src[nSrc+3]
|
||||
if c3 < 0x30 || 0x3a <= c3 {
|
||||
err = errInvalidGB18030
|
||||
break loop
|
||||
}
|
||||
size = 4
|
||||
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
|
||||
if r < 39420 {
|
||||
i, j := 0, len(gb18030)
|
||||
for i < j {
|
||||
h := i + (j-i)/2
|
||||
if r >= rune(gb18030[h][0]) {
|
||||
i = h + 1
|
||||
} else {
|
||||
j = h
|
||||
}
|
||||
}
|
||||
dec := &gb18030[i-1]
|
||||
r += rune(dec[1]) - rune(dec[0])
|
||||
goto write
|
||||
}
|
||||
r -= 189000
|
||||
if 0 <= r && r < 0x100000 {
|
||||
r += 0x10000
|
||||
goto write
|
||||
}
|
||||
err = errInvalidGB18030
|
||||
break loop
|
||||
default:
|
||||
if d.gb18030 {
|
||||
err = errInvalidGB18030
|
||||
} else {
|
||||
err = errInvalidGBK
|
||||
}
|
||||
break loop
|
||||
}
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
|
||||
r = rune(decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
if d.gb18030 {
|
||||
err = errInvalidGB18030
|
||||
} else {
|
||||
err = errInvalidGBK
|
||||
}
|
||||
break loop
|
||||
}
|
||||
|
||||
write:
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
if atEOF && err == transform.ErrShortSrc {
|
||||
if d.gb18030 {
|
||||
err = errInvalidGB18030
|
||||
} else {
|
||||
err = errInvalidGBK
|
||||
}
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type gbkEncoder struct {
|
||||
transform.NopResetter
|
||||
gb18030 bool
|
||||
}
|
||||
|
||||
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, r2, size := rune(0), rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||
// says to treat "gbk" as Code Page 936.
|
||||
if r == '€' {
|
||||
r = 0x80
|
||||
goto write1
|
||||
}
|
||||
if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
|
||||
if e.gb18030 {
|
||||
if r < 0x10000 {
|
||||
i, j := 0, len(gb18030)
|
||||
for i < j {
|
||||
h := i + (j-i)/2
|
||||
if r >= rune(gb18030[h][1]) {
|
||||
i = h + 1
|
||||
} else {
|
||||
j = h
|
||||
}
|
||||
}
|
||||
dec := &gb18030[i-1]
|
||||
r += rune(dec[0]) - rune(dec[1])
|
||||
goto write4
|
||||
} else if r < 0x110000 {
|
||||
r += 189000 - 0x10000
|
||||
goto write4
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write1:
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r2 >> 8)
|
||||
dst[nDst+1] = uint8(r2)
|
||||
nDst += 2
|
||||
continue
|
||||
|
||||
write4:
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+3] = uint8(r%10 + 0x30)
|
||||
r /= 10
|
||||
dst[nDst+2] = uint8(r%126 + 0x81)
|
||||
r /= 126
|
||||
dst[nDst+1] = uint8(r%10 + 0x30)
|
||||
r /= 10
|
||||
dst[nDst+0] = uint8(r + 0x81)
|
||||
nDst += 4
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 5 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
+240
@@ -0,0 +1,240 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// HZGB2312 is the HZ-GB2312 encoding.
|
||||
var HZGB2312 encoding.Encoding = &hzGB2312
|
||||
|
||||
var hzGB2312 = internal.Encoding{
|
||||
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
|
||||
"HZ-GB2312",
|
||||
identifier.HZGB2312,
|
||||
}
|
||||
|
||||
func hzGB2312NewDecoder() transform.Transformer {
|
||||
return new(hzGB2312Decoder)
|
||||
}
|
||||
|
||||
func hzGB2312NewEncoder() transform.Transformer {
|
||||
return new(hzGB2312Encoder)
|
||||
}
|
||||
|
||||
var errInvalidHZGB2312 = errors.New("simplifiedchinese: invalid HZ-GB2312 encoding")
|
||||
|
||||
const (
|
||||
asciiState = iota
|
||||
gbState
|
||||
)
|
||||
|
||||
type hzGB2312Decoder int
|
||||
|
||||
func (d *hzGB2312Decoder) Reset() {
|
||||
*d = asciiState
|
||||
}
|
||||
|
||||
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
c0 := src[nSrc]
|
||||
if c0 >= utf8.RuneSelf {
|
||||
err = errInvalidHZGB2312
|
||||
break loop
|
||||
}
|
||||
|
||||
if c0 == '~' {
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
size = 2
|
||||
switch src[nSrc+1] {
|
||||
case '{':
|
||||
*d = gbState
|
||||
continue
|
||||
case '}':
|
||||
*d = asciiState
|
||||
continue
|
||||
case '~':
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
dst[nDst] = '~'
|
||||
nDst++
|
||||
continue
|
||||
case '\n':
|
||||
continue
|
||||
default:
|
||||
err = errInvalidHZGB2312
|
||||
break loop
|
||||
}
|
||||
}
|
||||
|
||||
if *d == asciiState {
|
||||
r, size = rune(c0), 1
|
||||
} else {
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
|
||||
err = errInvalidHZGB2312
|
||||
break loop
|
||||
}
|
||||
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
|
||||
r = rune(decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
}
|
||||
if atEOF && err == transform.ErrShortSrc {
|
||||
err = errInvalidHZGB2312
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type hzGB2312Encoder int
|
||||
|
||||
func (d *hzGB2312Encoder) Reset() {
|
||||
*d = asciiState
|
||||
}
|
||||
|
||||
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
if r == '~' {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '~'
|
||||
nDst += 2
|
||||
continue
|
||||
} else if *e != asciiState {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = asciiState
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '}'
|
||||
nDst += 2
|
||||
} else if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst += 1
|
||||
continue
|
||||
|
||||
}
|
||||
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto writeGB
|
||||
}
|
||||
}
|
||||
|
||||
terminateInASCIIState:
|
||||
// Switch back to ASCII state in case of error so that an ASCII
|
||||
// replacement character can be written in the correct state.
|
||||
if *e != asciiState {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '}'
|
||||
nDst += 2
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
|
||||
writeGB:
|
||||
c0 := uint8(r>>8) - 0x80
|
||||
c1 := uint8(r) - 0x80
|
||||
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
|
||||
goto terminateInASCIIState
|
||||
}
|
||||
if *e == asciiState {
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
*e = gbState
|
||||
dst[nDst+0] = '~'
|
||||
dst[nDst+1] = '{'
|
||||
nDst += 2
|
||||
} else if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = c0
|
||||
dst[nDst+1] = c1
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
// TODO: should one always terminate in ASCII state to make it safe to
|
||||
// concatenate two HZ-GB2312-encoded strings?
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+161
@@ -0,0 +1,161 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n")
|
||||
fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n")
|
||||
|
||||
printGB18030()
|
||||
printGBK()
|
||||
}
|
||||
|
||||
func printGB18030() {
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n")
|
||||
fmt.Printf("var gb18030 = [...][2]uint16{\n")
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint32(0), uint32(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0x10000 && y < 0x10000 {
|
||||
fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
func printGBK() {
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint16{}
|
||||
reverse := [65536]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint16(0), uint16(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0 || 126*190 <= x {
|
||||
log.Fatalf("GBK code %d is out of range", x)
|
||||
}
|
||||
mapping[x] = y
|
||||
if reverse[y] == 0 {
|
||||
c0, c1 := x/190, x%190
|
||||
if c1 >= 0x3f {
|
||||
c1++
|
||||
}
|
||||
reverse[y] = (0x81+c0)<<8 | (0x40 + c1)
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
|
||||
fmt.Printf("var decode = [...]uint16{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%04X,\n", i, v)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v == 0 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
+43999
File diff suppressed because it is too large
Load Diff
+198
@@ -0,0 +1,198 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package traditionalchinese
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{Big5}
|
||||
|
||||
// Big5 is the Big5 encoding, also known as Code Page 950.
|
||||
var Big5 encoding.Encoding = &big5
|
||||
|
||||
var big5 = internal.Encoding{
|
||||
&internal.SimpleEncoding{big5Decoder{}, big5Encoder{}},
|
||||
"Big5",
|
||||
identifier.Big5,
|
||||
}
|
||||
|
||||
var errInvalidBig5 = errors.New("traditionalchinese: invalid Big5 encoding")
|
||||
|
||||
type big5Decoder struct{ transform.NopResetter }
|
||||
|
||||
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size, s := rune(0), 0, ""
|
||||
loop:
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
switch c0 := src[nSrc]; {
|
||||
case c0 < utf8.RuneSelf:
|
||||
r, size = rune(c0), 1
|
||||
|
||||
case 0x81 <= c0 && c0 < 0xff:
|
||||
if nSrc+1 >= len(src) {
|
||||
err = transform.ErrShortSrc
|
||||
break loop
|
||||
}
|
||||
c1 := src[nSrc+1]
|
||||
switch {
|
||||
case 0x40 <= c1 && c1 < 0x7f:
|
||||
c1 -= 0x40
|
||||
case 0xa1 <= c1 && c1 < 0xff:
|
||||
c1 -= 0x62
|
||||
default:
|
||||
err = errInvalidBig5
|
||||
break loop
|
||||
}
|
||||
r, size = '\ufffd', 2
|
||||
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
|
||||
if 1133 <= i && i < 1167 {
|
||||
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
|
||||
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
|
||||
switch i {
|
||||
case 1133:
|
||||
s = "\u00CA\u0304"
|
||||
goto writeStr
|
||||
case 1135:
|
||||
s = "\u00CA\u030C"
|
||||
goto writeStr
|
||||
case 1164:
|
||||
s = "\u00EA\u0304"
|
||||
goto writeStr
|
||||
case 1166:
|
||||
s = "\u00EA\u030C"
|
||||
goto writeStr
|
||||
}
|
||||
}
|
||||
r = rune(decode[i])
|
||||
if r == 0 {
|
||||
r = '\ufffd'
|
||||
}
|
||||
}
|
||||
|
||||
default:
|
||||
err = errInvalidBig5
|
||||
break loop
|
||||
}
|
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
continue loop
|
||||
|
||||
writeStr:
|
||||
if nDst+len(s) > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break loop
|
||||
}
|
||||
nDst += copy(dst[nDst:], s)
|
||||
continue loop
|
||||
}
|
||||
if atEOF && err == transform.ErrShortSrc {
|
||||
err = errInvalidBig5
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type big5Encoder struct{ transform.NopResetter }
|
||||
|
||||
func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
r, size := rune(0), 0
|
||||
for ; nSrc < len(src); nSrc += size {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
if nDst >= len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = uint8(r)
|
||||
nDst++
|
||||
continue
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if r >= utf8.RuneSelf {
|
||||
// func init checks that the switch covers all tables.
|
||||
switch {
|
||||
case encode0Low <= r && r < encode0High:
|
||||
if r = rune(encode0[r-encode0Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode1Low <= r && r < encode1High:
|
||||
if r = rune(encode1[r-encode1Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode2Low <= r && r < encode2High:
|
||||
if r = rune(encode2[r-encode2Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode3Low <= r && r < encode3High:
|
||||
if r = rune(encode3[r-encode3Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode4Low <= r && r < encode4High:
|
||||
if r = rune(encode4[r-encode4Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode5Low <= r && r < encode5High:
|
||||
if r = rune(encode5[r-encode5Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode6Low <= r && r < encode6High:
|
||||
if r = rune(encode6[r-encode6Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
case encode7Low <= r && r < encode7High:
|
||||
if r = rune(encode7[r-encode7Low]); r != 0 {
|
||||
goto write2
|
||||
}
|
||||
}
|
||||
err = internal.ErrASCIIReplacement
|
||||
break
|
||||
}
|
||||
|
||||
write2:
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r >> 8)
|
||||
dst[nDst+1] = uint8(r)
|
||||
nDst += 2
|
||||
continue
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 8 {
|
||||
panic("bad numEncodeTables")
|
||||
}
|
||||
}
|
||||
+140
@@ -0,0 +1,140 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This program generates tables.go:
|
||||
// go run maketables.go | gofmt > tables.go
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
|
||||
fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n")
|
||||
fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n")
|
||||
|
||||
res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt")
|
||||
if err != nil {
|
||||
log.Fatalf("Get: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
mapping := [65536]uint32{}
|
||||
reverse := [65536 * 4]uint16{}
|
||||
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
for scanner.Scan() {
|
||||
s := strings.TrimSpace(scanner.Text())
|
||||
if s == "" || s[0] == '#' {
|
||||
continue
|
||||
}
|
||||
x, y := uint16(0), uint32(0)
|
||||
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
|
||||
log.Fatalf("could not parse %q", s)
|
||||
}
|
||||
if x < 0 || 126*157 <= x {
|
||||
log.Fatalf("Big5 code %d is out of range", x)
|
||||
}
|
||||
mapping[x] = y
|
||||
|
||||
// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that
|
||||
// "The index pointer for code point in index is the first pointer
|
||||
// corresponding to code point in index", which would normally mean
|
||||
// that the code below should be guarded by "if reverse[y] == 0", but
|
||||
// last instead of first seems to match the behavior of
|
||||
// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in
|
||||
// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148
|
||||
// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc")
|
||||
// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc".
|
||||
c0, c1 := x/157, x%157
|
||||
if c1 < 0x3f {
|
||||
c1 += 0x40
|
||||
} else {
|
||||
c1 += 0x62
|
||||
}
|
||||
reverse[y] = (0x81+c0)<<8 | c1
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
log.Fatalf("scanner error: %v", err)
|
||||
}
|
||||
|
||||
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
|
||||
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
|
||||
fmt.Printf("var decode = [...]uint32{\n")
|
||||
for i, v := range mapping {
|
||||
if v != 0 {
|
||||
fmt.Printf("\t%d: 0x%08X,\n", i, v)
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
|
||||
// Any run of at least separation continuous zero entries in the reverse map will
|
||||
// be a separate encode table.
|
||||
const separation = 1024
|
||||
|
||||
intervals := []interval(nil)
|
||||
low, high := -1, -1
|
||||
for i, v := range reverse {
|
||||
if v == 0 {
|
||||
continue
|
||||
}
|
||||
if low < 0 {
|
||||
low = i
|
||||
} else if i-high >= separation {
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
low = i
|
||||
}
|
||||
high = i + 1
|
||||
}
|
||||
if high >= 0 {
|
||||
intervals = append(intervals, interval{low, high})
|
||||
}
|
||||
sort.Sort(byDecreasingLength(intervals))
|
||||
|
||||
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
|
||||
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
|
||||
fmt.Printf("// sorted by decreasing length.\n")
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
|
||||
for i, v := range intervals {
|
||||
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
|
||||
fmt.Printf("var encode%d = [...]uint16{\n", i)
|
||||
for j := v.low; j < v.high; j++ {
|
||||
x := reverse[j]
|
||||
if x == 0 {
|
||||
continue
|
||||
}
|
||||
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
// interval is a half-open interval [low, high).
|
||||
type interval struct {
|
||||
low, high int
|
||||
}
|
||||
|
||||
func (i interval) len() int { return i.high - i.low }
|
||||
|
||||
// byDecreasingLength sorts intervals by decreasing length.
|
||||
type byDecreasingLength []interval
|
||||
|
||||
func (b byDecreasingLength) Len() int { return len(b) }
|
||||
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
|
||||
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
+37142
File diff suppressed because it is too large
Load Diff
+82
@@ -0,0 +1,82 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode
|
||||
|
||||
import (
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// BOMOverride returns a new decoder transformer that is identical to fallback,
|
||||
// except that the presence of a Byte Order Mark at the start of the input
|
||||
// causes it to switch to the corresponding Unicode decoding. It will only
|
||||
// consider BOMs for UTF-8, UTF-16BE, and UTF-16LE.
|
||||
//
|
||||
// This differs from using ExpectBOM by allowing a BOM to switch to UTF-8, not
|
||||
// just UTF-16 variants, and allowing falling back to any encoding scheme.
|
||||
//
|
||||
// This technique is recommended by the W3C for use in HTML 5: "For
|
||||
// compatibility with deployed content, the byte order mark (also known as BOM)
|
||||
// is considered more authoritative than anything else."
|
||||
// http://www.w3.org/TR/encoding/#specification-hooks
|
||||
//
|
||||
// Using BOMOverride is mostly intended for use cases where the first characters
|
||||
// of a fallback encoding are known to not be a BOM, for example, for valid HTML
|
||||
// and most encodings.
|
||||
func BOMOverride(fallback transform.Transformer) transform.Transformer {
|
||||
// TODO: possibly allow a variadic argument of unicode encodings to allow
|
||||
// specifying details of which fallbacks are supported as well as
|
||||
// specifying the details of the implementations. This would also allow for
|
||||
// support for UTF-32, which should not be supported by default.
|
||||
return &bomOverride{fallback: fallback}
|
||||
}
|
||||
|
||||
type bomOverride struct {
|
||||
fallback transform.Transformer
|
||||
current transform.Transformer
|
||||
}
|
||||
|
||||
func (d *bomOverride) Reset() {
|
||||
d.current = nil
|
||||
d.fallback.Reset()
|
||||
}
|
||||
|
||||
var (
|
||||
// TODO: we could use decode functions here, instead of allocating a new
|
||||
// decoder on every NewDecoder as IgnoreBOM decoders can be stateless.
|
||||
utf16le = UTF16(LittleEndian, IgnoreBOM)
|
||||
utf16be = UTF16(BigEndian, IgnoreBOM)
|
||||
)
|
||||
|
||||
const utf8BOM = "\ufeff"
|
||||
|
||||
func (d *bomOverride) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if d.current != nil {
|
||||
return d.current.Transform(dst, src, atEOF)
|
||||
}
|
||||
if len(src) < 3 && !atEOF {
|
||||
return 0, 0, transform.ErrShortSrc
|
||||
}
|
||||
d.current = d.fallback
|
||||
bomSize := 0
|
||||
if len(src) >= 2 {
|
||||
if src[0] == 0xFF && src[1] == 0xFE {
|
||||
d.current = utf16le.NewDecoder()
|
||||
bomSize = 2
|
||||
} else if src[0] == 0xFE && src[1] == 0xFF {
|
||||
d.current = utf16be.NewDecoder()
|
||||
bomSize = 2
|
||||
} else if len(src) >= 3 &&
|
||||
src[0] == utf8BOM[0] &&
|
||||
src[1] == utf8BOM[1] &&
|
||||
src[2] == utf8BOM[2] {
|
||||
d.current = transform.Nop
|
||||
bomSize = 3
|
||||
}
|
||||
}
|
||||
if bomSize < len(src) {
|
||||
nDst, nSrc, err = d.current.Transform(dst, src[bomSize:], atEOF)
|
||||
}
|
||||
return nDst, nSrc + bomSize, err
|
||||
}
|
||||
+434
@@ -0,0 +1,434 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package unicode provides Unicode encodings such as UTF-16.
|
||||
package unicode // import "golang.org/x/text/encoding/unicode"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf16"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/internal/utf8internal"
|
||||
"golang.org/x/text/runes"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// TODO: I think the Transformers really should return errors on unmatched
|
||||
// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
|
||||
// which leaves it open, but is suggested by WhatWG. It will allow for all error
|
||||
// modes as defined by WhatWG: fatal, HTML and Replacement. This would require
|
||||
// the introduction of some kind of error type for conveying the erroneous code
|
||||
// point.
|
||||
|
||||
// UTF8 is the UTF-8 encoding.
|
||||
var UTF8 encoding.Encoding = utf8enc
|
||||
|
||||
var utf8enc = &internal.Encoding{
|
||||
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
|
||||
"UTF-8",
|
||||
identifier.UTF8,
|
||||
}
|
||||
|
||||
type utf8Decoder struct{ transform.NopResetter }
|
||||
|
||||
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
var pSrc int // point from which to start copy in src
|
||||
var accept utf8internal.AcceptRange
|
||||
|
||||
// The decoder can only make the input larger, not smaller.
|
||||
n := len(src)
|
||||
if len(dst) < n {
|
||||
err = transform.ErrShortDst
|
||||
n = len(dst)
|
||||
atEOF = false
|
||||
}
|
||||
for nSrc < n {
|
||||
c := src[nSrc]
|
||||
if c < utf8.RuneSelf {
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
first := utf8internal.First[c]
|
||||
size := int(first & utf8internal.SizeMask)
|
||||
if first == utf8internal.FirstInvalid {
|
||||
goto handleInvalid // invalid starter byte
|
||||
}
|
||||
accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
|
||||
if nSrc+size > n {
|
||||
if !atEOF {
|
||||
// We may stop earlier than necessary here if the short sequence
|
||||
// has invalid bytes. Not checking for this simplifies the code
|
||||
// and may avoid duplicate computations in certain conditions.
|
||||
if err == nil {
|
||||
err = transform.ErrShortSrc
|
||||
}
|
||||
break
|
||||
}
|
||||
// Determine the maximal subpart of an ill-formed subsequence.
|
||||
switch {
|
||||
case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
|
||||
size = 1
|
||||
case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
|
||||
size = 2
|
||||
default:
|
||||
size = 3 // As we are short, the maximum is 3.
|
||||
}
|
||||
goto handleInvalid
|
||||
}
|
||||
if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
|
||||
size = 1
|
||||
goto handleInvalid // invalid continuation byte
|
||||
} else if size == 2 {
|
||||
} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
|
||||
size = 2
|
||||
goto handleInvalid // invalid continuation byte
|
||||
} else if size == 3 {
|
||||
} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
|
||||
size = 3
|
||||
goto handleInvalid // invalid continuation byte
|
||||
}
|
||||
nSrc += size
|
||||
continue
|
||||
|
||||
handleInvalid:
|
||||
// Copy the scanned input so far.
|
||||
nDst += copy(dst[nDst:], src[pSrc:nSrc])
|
||||
|
||||
// Append RuneError to the destination.
|
||||
const runeError = "\ufffd"
|
||||
if nDst+len(runeError) > len(dst) {
|
||||
return nDst, nSrc, transform.ErrShortDst
|
||||
}
|
||||
nDst += copy(dst[nDst:], runeError)
|
||||
|
||||
// Skip the maximal subpart of an ill-formed subsequence according to
|
||||
// the W3C standard way instead of the Go way. This Transform is
|
||||
// probably the only place in the text repo where it is warranted.
|
||||
nSrc += size
|
||||
pSrc = nSrc
|
||||
|
||||
// Recompute the maximum source length.
|
||||
if sz := len(dst) - nDst; sz < len(src)-nSrc {
|
||||
err = transform.ErrShortDst
|
||||
n = nSrc + sz
|
||||
atEOF = false
|
||||
}
|
||||
}
|
||||
return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
|
||||
}
|
||||
|
||||
// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
|
||||
// order mark (BOM) policy.
|
||||
//
|
||||
// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
|
||||
// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
|
||||
// the endianness used for decoding, and will instead be output as their
|
||||
// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
|
||||
// is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
|
||||
// Instead, it overrides the default endianness e for the remainder of the
|
||||
// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
|
||||
// affect the endianness used, and will instead be output as their standard
|
||||
// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
|
||||
// with the default Endianness. For ExpectBOM, in that case, the transformation
|
||||
// will return early with an ErrMissingBOM error.
|
||||
//
|
||||
// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
|
||||
// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
|
||||
// be inserted. The UTF-8 input does not need to contain a BOM.
|
||||
//
|
||||
// There is no concept of a 'native' endianness. If the UTF-16 data is produced
|
||||
// and consumed in a greater context that implies a certain endianness, use
|
||||
// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
|
||||
//
|
||||
// In the language of http://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
|
||||
// corresponds to "Where the precise type of the data stream is known... the
|
||||
// BOM should not be used" and ExpectBOM corresponds to "A particular
|
||||
// protocol... may require use of the BOM".
|
||||
func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
|
||||
return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
|
||||
}
|
||||
|
||||
// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
|
||||
// some configurations map to the same MIB identifier. RFC 2781 has requirements
|
||||
// and recommendations. Some of the "configurations" are merely recommendations,
|
||||
// so multiple configurations could match.
|
||||
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
|
||||
BigEndian: [numBOMValues]identifier.MIB{
|
||||
IgnoreBOM: identifier.UTF16BE,
|
||||
UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
|
||||
// TODO: acceptBOM | strictBOM would map to UTF16BE as well.
|
||||
},
|
||||
LittleEndian: [numBOMValues]identifier.MIB{
|
||||
IgnoreBOM: identifier.UTF16LE,
|
||||
UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
|
||||
// TODO: acceptBOM | strictBOM would map to UTF16LE as well.
|
||||
},
|
||||
// ExpectBOM is not widely used and has no valid MIB identifier.
|
||||
}
|
||||
|
||||
// All lists a configuration for each IANA-defined UTF-16 variant.
|
||||
var All = []encoding.Encoding{
|
||||
UTF8,
|
||||
UTF16(BigEndian, UseBOM),
|
||||
UTF16(BigEndian, IgnoreBOM),
|
||||
UTF16(LittleEndian, IgnoreBOM),
|
||||
}
|
||||
|
||||
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
|
||||
type BOMPolicy uint8
|
||||
|
||||
const (
|
||||
writeBOM BOMPolicy = 0x01
|
||||
acceptBOM BOMPolicy = 0x02
|
||||
requireBOM BOMPolicy = 0x04
|
||||
bomMask BOMPolicy = 0x07
|
||||
|
||||
// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
|
||||
// map of an array of length 8 of a type that is also used as a key or value
|
||||
// in another map). See golang.org/issue/11354.
|
||||
// TODO: consider changing this value back to 8 if the use of 1.4.* has
|
||||
// been minimized.
|
||||
numBOMValues = 8 + 1
|
||||
|
||||
// IgnoreBOM means to ignore any byte order marks.
|
||||
IgnoreBOM BOMPolicy = 0
|
||||
// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
|
||||
|
||||
// UseBOM means that the UTF-16 form may start with a byte order mark, which
|
||||
// will be used to override the default encoding.
|
||||
UseBOM BOMPolicy = writeBOM | acceptBOM
|
||||
// Common and RFC 2781-compliant interpretation for UTF-16.
|
||||
|
||||
// ExpectBOM means that the UTF-16 form must start with a byte order mark,
|
||||
// which will be used to override the default encoding.
|
||||
ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
|
||||
// Used in Java as Unicode (not to be confused with Java's UTF-16) and
|
||||
// ICU's UTF-16,version=1. Not compliant with RFC 2781.
|
||||
|
||||
// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
|
||||
// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
|
||||
// (UnicodeBig and UnicodeLittle in Java)
|
||||
// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
|
||||
// acceptBOM | strictBOM (e.g. assigned to CheckBOM).
|
||||
// This addition would be consistent with supporting ExpectBOM.
|
||||
)
|
||||
|
||||
// Endianness is a UTF-16 encoding's default endianness.
|
||||
type Endianness bool
|
||||
|
||||
const (
|
||||
// BigEndian is UTF-16BE.
|
||||
BigEndian Endianness = false
|
||||
// LittleEndian is UTF-16LE.
|
||||
LittleEndian Endianness = true
|
||||
)
|
||||
|
||||
// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
|
||||
// starting byte order mark.
|
||||
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
|
||||
|
||||
type utf16Encoding struct {
|
||||
config
|
||||
mib identifier.MIB
|
||||
}
|
||||
|
||||
type config struct {
|
||||
endianness Endianness
|
||||
bomPolicy BOMPolicy
|
||||
}
|
||||
|
||||
func (u utf16Encoding) NewDecoder() *encoding.Decoder {
|
||||
return &encoding.Decoder{Transformer: &utf16Decoder{
|
||||
initial: u.config,
|
||||
current: u.config,
|
||||
}}
|
||||
}
|
||||
|
||||
func (u utf16Encoding) NewEncoder() *encoding.Encoder {
|
||||
return &encoding.Encoder{Transformer: &utf16Encoder{
|
||||
endianness: u.endianness,
|
||||
initialBOMPolicy: u.bomPolicy,
|
||||
currentBOMPolicy: u.bomPolicy,
|
||||
}}
|
||||
}
|
||||
|
||||
func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
|
||||
return u.mib, ""
|
||||
}
|
||||
|
||||
func (u utf16Encoding) String() string {
|
||||
e, b := "B", ""
|
||||
if u.endianness == LittleEndian {
|
||||
e = "L"
|
||||
}
|
||||
switch u.bomPolicy {
|
||||
case ExpectBOM:
|
||||
b = "Expect"
|
||||
case UseBOM:
|
||||
b = "Use"
|
||||
case IgnoreBOM:
|
||||
b = "Ignore"
|
||||
}
|
||||
return "UTF-16" + e + "E (" + b + " BOM)"
|
||||
}
|
||||
|
||||
type utf16Decoder struct {
|
||||
initial config
|
||||
current config
|
||||
}
|
||||
|
||||
func (u *utf16Decoder) Reset() {
|
||||
u.current = u.initial
|
||||
}
|
||||
|
||||
func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if len(src) == 0 {
|
||||
if atEOF && u.current.bomPolicy&requireBOM != 0 {
|
||||
return 0, 0, ErrMissingBOM
|
||||
}
|
||||
return 0, 0, nil
|
||||
}
|
||||
if u.current.bomPolicy&acceptBOM != 0 {
|
||||
if len(src) < 2 {
|
||||
return 0, 0, transform.ErrShortSrc
|
||||
}
|
||||
switch {
|
||||
case src[0] == 0xfe && src[1] == 0xff:
|
||||
u.current.endianness = BigEndian
|
||||
nSrc = 2
|
||||
case src[0] == 0xff && src[1] == 0xfe:
|
||||
u.current.endianness = LittleEndian
|
||||
nSrc = 2
|
||||
default:
|
||||
if u.current.bomPolicy&requireBOM != 0 {
|
||||
return 0, 0, ErrMissingBOM
|
||||
}
|
||||
}
|
||||
u.current.bomPolicy = IgnoreBOM
|
||||
}
|
||||
|
||||
var r rune
|
||||
var dSize, sSize int
|
||||
for nSrc < len(src) {
|
||||
if nSrc+1 < len(src) {
|
||||
x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
|
||||
if u.current.endianness == LittleEndian {
|
||||
x = x>>8 | x<<8
|
||||
}
|
||||
r, sSize = rune(x), 2
|
||||
if utf16.IsSurrogate(r) {
|
||||
if nSrc+3 < len(src) {
|
||||
x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
|
||||
if u.current.endianness == LittleEndian {
|
||||
x = x>>8 | x<<8
|
||||
}
|
||||
// Save for next iteration if it is not a high surrogate.
|
||||
if isHighSurrogate(rune(x)) {
|
||||
r, sSize = utf16.DecodeRune(r, rune(x)), 4
|
||||
}
|
||||
} else if !atEOF {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
if dSize = utf8.RuneLen(r); dSize < 0 {
|
||||
r, dSize = utf8.RuneError, 3
|
||||
}
|
||||
} else if atEOF {
|
||||
// Single trailing byte.
|
||||
r, dSize, sSize = utf8.RuneError, 3, 1
|
||||
} else {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
if nDst+dSize > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
nSrc += sSize
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
func isHighSurrogate(r rune) bool {
|
||||
return 0xDC00 <= r && r <= 0xDFFF
|
||||
}
|
||||
|
||||
type utf16Encoder struct {
|
||||
endianness Endianness
|
||||
initialBOMPolicy BOMPolicy
|
||||
currentBOMPolicy BOMPolicy
|
||||
}
|
||||
|
||||
func (u *utf16Encoder) Reset() {
|
||||
u.currentBOMPolicy = u.initialBOMPolicy
|
||||
}
|
||||
|
||||
func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if u.currentBOMPolicy&writeBOM != 0 {
|
||||
if len(dst) < 2 {
|
||||
return 0, 0, transform.ErrShortDst
|
||||
}
|
||||
dst[0], dst[1] = 0xfe, 0xff
|
||||
u.currentBOMPolicy = IgnoreBOM
|
||||
nDst = 2
|
||||
}
|
||||
|
||||
r, size := rune(0), 0
|
||||
for nSrc < len(src) {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if r <= 0xffff {
|
||||
if nDst+2 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = uint8(r >> 8)
|
||||
dst[nDst+1] = uint8(r)
|
||||
nDst += 2
|
||||
} else {
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
r1, r2 := utf16.EncodeRune(r)
|
||||
dst[nDst+0] = uint8(r1 >> 8)
|
||||
dst[nDst+1] = uint8(r1)
|
||||
dst[nDst+2] = uint8(r2 >> 8)
|
||||
dst[nDst+3] = uint8(r2)
|
||||
nDst += 4
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
|
||||
if u.endianness == LittleEndian {
|
||||
for i := 0; i < nDst; i += 2 {
|
||||
dst[i], dst[i+1] = dst[i+1], dst[i]
|
||||
}
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+296
@@ -0,0 +1,296 @@
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package utf32 provides the UTF-32 Unicode encoding.
|
||||
//
|
||||
// Please note that support for UTF-32 is discouraged as it is a rare and
|
||||
// inefficient encoding, unfit for use as an interchange format. For use
|
||||
// on the web, the W3C strongly discourages its use
|
||||
// (https://www.w3.org/TR/html5/document-metadata.html#charset)
|
||||
// while WHATWG directly prohibits supporting it
|
||||
// (https://html.spec.whatwg.org/multipage/syntax.html#character-encodings).
|
||||
package utf32 // import "golang.org/x/text/encoding/unicode/utf32"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/encoding"
|
||||
"golang.org/x/text/encoding/internal/identifier"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// All lists a configuration for each IANA-defined UTF-32 variant.
|
||||
var All = []encoding.Encoding{
|
||||
UTF32(BigEndian, UseBOM),
|
||||
UTF32(BigEndian, IgnoreBOM),
|
||||
UTF32(LittleEndian, IgnoreBOM),
|
||||
}
|
||||
|
||||
// ErrMissingBOM means that decoding UTF-32 input with ExpectBOM did not
|
||||
// find a starting byte order mark.
|
||||
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
|
||||
|
||||
// UTF32 returns a UTF-32 Encoding for the given default endianness and
|
||||
// byte order mark (BOM) policy.
|
||||
//
|
||||
// When decoding from UTF-32 to UTF-8, if the BOMPolicy is IgnoreBOM then
|
||||
// neither BOMs U+FEFF nor ill-formed code units 0xFFFE0000 in the input
|
||||
// stream will affect the endianness used for decoding. Instead BOMs will
|
||||
// be output as their standard UTF-8 encoding "\xef\xbb\xbf" while
|
||||
// 0xFFFE0000 code units will be output as "\xef\xbf\xbd", the standard
|
||||
// UTF-8 encoding for the Unicode replacement character. If the BOMPolicy
|
||||
// is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8
|
||||
// output. Instead, it overrides the default endianness e for the remainder
|
||||
// of the transformation. Any subsequent BOMs U+FEFF or ill-formed code
|
||||
// units 0xFFFE0000 will not affect the endianness used, and will instead
|
||||
// be output as their standard UTF-8 (replacement) encodings. For UseBOM,
|
||||
// if there is no starting BOM, it will proceed with the default
|
||||
// Endianness. For ExpectBOM, in that case, the transformation will return
|
||||
// early with an ErrMissingBOM error.
|
||||
//
|
||||
// When encoding from UTF-8 to UTF-32, a BOM will be inserted at the start
|
||||
// of the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM
|
||||
// will not be inserted. The UTF-8 input does not need to contain a BOM.
|
||||
//
|
||||
// There is no concept of a 'native' endianness. If the UTF-32 data is
|
||||
// produced and consumed in a greater context that implies a certain
|
||||
// endianness, use IgnoreBOM. Otherwise, use ExpectBOM and always produce
|
||||
// and consume a BOM.
|
||||
//
|
||||
// In the language of http://www.unicode.org/faq/utf_bom.html#bom10,
|
||||
// IgnoreBOM corresponds to "Where the precise type of the data stream is
|
||||
// known... the BOM should not be used" and ExpectBOM corresponds to "A
|
||||
// particular protocol... may require use of the BOM".
|
||||
func UTF32(e Endianness, b BOMPolicy) encoding.Encoding {
|
||||
return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]}
|
||||
}
|
||||
|
||||
// mibValue maps Endianness and BOMPolicy settings to MIB constants for UTF-32.
|
||||
// Note that some configurations map to the same MIB identifier.
|
||||
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
|
||||
BigEndian: [numBOMValues]identifier.MIB{
|
||||
IgnoreBOM: identifier.UTF32BE,
|
||||
UseBOM: identifier.UTF32,
|
||||
},
|
||||
LittleEndian: [numBOMValues]identifier.MIB{
|
||||
IgnoreBOM: identifier.UTF32LE,
|
||||
UseBOM: identifier.UTF32,
|
||||
},
|
||||
// ExpectBOM is not widely used and has no valid MIB identifier.
|
||||
}
|
||||
|
||||
// BOMPolicy is a UTF-32 encodings's byte order mark policy.
|
||||
type BOMPolicy uint8
|
||||
|
||||
const (
|
||||
writeBOM BOMPolicy = 0x01
|
||||
acceptBOM BOMPolicy = 0x02
|
||||
requireBOM BOMPolicy = 0x04
|
||||
bomMask BOMPolicy = 0x07
|
||||
|
||||
// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
|
||||
// map of an array of length 8 of a type that is also used as a key or value
|
||||
// in another map). See golang.org/issue/11354.
|
||||
// TODO: consider changing this value back to 8 if the use of 1.4.* has
|
||||
// been minimized.
|
||||
numBOMValues = 8 + 1
|
||||
|
||||
// IgnoreBOM means to ignore any byte order marks.
|
||||
IgnoreBOM BOMPolicy = 0
|
||||
// Unicode-compliant interpretation for UTF-32BE/LE.
|
||||
|
||||
// UseBOM means that the UTF-32 form may start with a byte order mark,
|
||||
// which will be used to override the default encoding.
|
||||
UseBOM BOMPolicy = writeBOM | acceptBOM
|
||||
// Unicode-compliant interpretation for UTF-32.
|
||||
|
||||
// ExpectBOM means that the UTF-32 form must start with a byte order mark,
|
||||
// which will be used to override the default encoding.
|
||||
ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
|
||||
// Consistent with BOMPolicy definition in golang.org/x/text/encoding/unicode
|
||||
)
|
||||
|
||||
// Endianness is a UTF-32 encoding's default endianness.
|
||||
type Endianness bool
|
||||
|
||||
const (
|
||||
// BigEndian is UTF-32BE.
|
||||
BigEndian Endianness = false
|
||||
// LittleEndian is UTF-32LE.
|
||||
LittleEndian Endianness = true
|
||||
)
|
||||
|
||||
type config struct {
|
||||
endianness Endianness
|
||||
bomPolicy BOMPolicy
|
||||
}
|
||||
|
||||
type utf32Encoding struct {
|
||||
config
|
||||
mib identifier.MIB
|
||||
}
|
||||
|
||||
func (u utf32Encoding) NewDecoder() *encoding.Decoder {
|
||||
return &encoding.Decoder{Transformer: &utf32Decoder{
|
||||
initial: u.config,
|
||||
current: u.config,
|
||||
}}
|
||||
}
|
||||
|
||||
func (u utf32Encoding) NewEncoder() *encoding.Encoder {
|
||||
return &encoding.Encoder{Transformer: &utf32Encoder{
|
||||
endianness: u.endianness,
|
||||
initialBOMPolicy: u.bomPolicy,
|
||||
currentBOMPolicy: u.bomPolicy,
|
||||
}}
|
||||
}
|
||||
|
||||
func (u utf32Encoding) ID() (mib identifier.MIB, other string) {
|
||||
return u.mib, ""
|
||||
}
|
||||
|
||||
func (u utf32Encoding) String() string {
|
||||
e, b := "B", ""
|
||||
if u.endianness == LittleEndian {
|
||||
e = "L"
|
||||
}
|
||||
switch u.bomPolicy {
|
||||
case ExpectBOM:
|
||||
b = "Expect"
|
||||
case UseBOM:
|
||||
b = "Use"
|
||||
case IgnoreBOM:
|
||||
b = "Ignore"
|
||||
}
|
||||
return "UTF-32" + e + "E (" + b + " BOM)"
|
||||
}
|
||||
|
||||
type utf32Decoder struct {
|
||||
initial config
|
||||
current config
|
||||
}
|
||||
|
||||
func (u *utf32Decoder) Reset() {
|
||||
u.current = u.initial
|
||||
}
|
||||
|
||||
func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if len(src) == 0 {
|
||||
if atEOF && u.current.bomPolicy&requireBOM != 0 {
|
||||
return 0, 0, ErrMissingBOM
|
||||
}
|
||||
return 0, 0, nil
|
||||
}
|
||||
if u.current.bomPolicy&acceptBOM != 0 {
|
||||
if len(src) < 4 {
|
||||
return 0, 0, transform.ErrShortSrc
|
||||
}
|
||||
switch {
|
||||
case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff:
|
||||
u.current.endianness = BigEndian
|
||||
nSrc = 4
|
||||
case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00:
|
||||
u.current.endianness = LittleEndian
|
||||
nSrc = 4
|
||||
default:
|
||||
if u.current.bomPolicy&requireBOM != 0 {
|
||||
return 0, 0, ErrMissingBOM
|
||||
}
|
||||
}
|
||||
u.current.bomPolicy = IgnoreBOM
|
||||
}
|
||||
|
||||
var r rune
|
||||
var dSize, sSize int
|
||||
for nSrc < len(src) {
|
||||
if nSrc+3 < len(src) {
|
||||
x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 |
|
||||
uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3])
|
||||
if u.current.endianness == LittleEndian {
|
||||
x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24
|
||||
}
|
||||
r, sSize = rune(x), 4
|
||||
if dSize = utf8.RuneLen(r); dSize < 0 {
|
||||
r, dSize = utf8.RuneError, 3
|
||||
}
|
||||
} else if atEOF {
|
||||
// 1..3 trailing bytes.
|
||||
r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc
|
||||
} else {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
if nDst+dSize > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += utf8.EncodeRune(dst[nDst:], r)
|
||||
nSrc += sSize
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
|
||||
type utf32Encoder struct {
|
||||
endianness Endianness
|
||||
initialBOMPolicy BOMPolicy
|
||||
currentBOMPolicy BOMPolicy
|
||||
}
|
||||
|
||||
func (u *utf32Encoder) Reset() {
|
||||
u.currentBOMPolicy = u.initialBOMPolicy
|
||||
}
|
||||
|
||||
func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
if u.currentBOMPolicy&writeBOM != 0 {
|
||||
if len(dst) < 4 {
|
||||
return 0, 0, transform.ErrShortDst
|
||||
}
|
||||
dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff
|
||||
u.currentBOMPolicy = IgnoreBOM
|
||||
nDst = 4
|
||||
}
|
||||
|
||||
r, size := rune(0), 0
|
||||
for nSrc < len(src) {
|
||||
r = rune(src[nSrc])
|
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
|
||||
} else {
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:])
|
||||
if size == 1 {
|
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if nDst+4 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
|
||||
dst[nDst+0] = uint8(r >> 24)
|
||||
dst[nDst+1] = uint8(r >> 16)
|
||||
dst[nDst+2] = uint8(r >> 8)
|
||||
dst[nDst+3] = uint8(r)
|
||||
nDst += 4
|
||||
nSrc += size
|
||||
}
|
||||
|
||||
if u.endianness == LittleEndian {
|
||||
for i := 0; i < nDst; i += 4 {
|
||||
dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i]
|
||||
}
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+100
@@ -0,0 +1,100 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package tag contains functionality handling tags and related data.
|
||||
package tag // import "golang.org/x/text/internal/tag"
|
||||
|
||||
import "sort"
|
||||
|
||||
// An Index converts tags to a compact numeric value.
|
||||
//
|
||||
// All elements are of size 4. Tags may be up to 4 bytes long. Excess bytes can
|
||||
// be used to store additional information about the tag.
|
||||
type Index string
|
||||
|
||||
// Elem returns the element data at the given index.
|
||||
func (s Index) Elem(x int) string {
|
||||
return string(s[x*4 : x*4+4])
|
||||
}
|
||||
|
||||
// Index reports the index of the given key or -1 if it could not be found.
|
||||
// Only the first len(key) bytes from the start of the 4-byte entries will be
|
||||
// considered for the search and the first match in Index will be returned.
|
||||
func (s Index) Index(key []byte) int {
|
||||
n := len(key)
|
||||
// search the index of the first entry with an equal or higher value than
|
||||
// key in s.
|
||||
index := sort.Search(len(s)/4, func(i int) bool {
|
||||
return cmp(s[i*4:i*4+n], key) != -1
|
||||
})
|
||||
i := index * 4
|
||||
if cmp(s[i:i+len(key)], key) != 0 {
|
||||
return -1
|
||||
}
|
||||
return index
|
||||
}
|
||||
|
||||
// Next finds the next occurrence of key after index x, which must have been
|
||||
// obtained from a call to Index using the same key. It returns x+1 or -1.
|
||||
func (s Index) Next(key []byte, x int) int {
|
||||
if x++; x*4 < len(s) && cmp(s[x*4:x*4+len(key)], key) == 0 {
|
||||
return x
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// cmp returns an integer comparing a and b lexicographically.
|
||||
func cmp(a Index, b []byte) int {
|
||||
n := len(a)
|
||||
if len(b) < n {
|
||||
n = len(b)
|
||||
}
|
||||
for i, c := range b[:n] {
|
||||
switch {
|
||||
case a[i] > c:
|
||||
return 1
|
||||
case a[i] < c:
|
||||
return -1
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case len(a) < len(b):
|
||||
return -1
|
||||
case len(a) > len(b):
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Compare returns an integer comparing a and b lexicographically.
|
||||
func Compare(a string, b []byte) int {
|
||||
return cmp(Index(a), b)
|
||||
}
|
||||
|
||||
// FixCase reformats b to the same pattern of cases as form.
|
||||
// If returns false if string b is malformed.
|
||||
func FixCase(form string, b []byte) bool {
|
||||
if len(form) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i, c := range b {
|
||||
if form[i] <= 'Z' {
|
||||
if c >= 'a' {
|
||||
c -= 'z' - 'Z'
|
||||
}
|
||||
if c < 'A' || 'Z' < c {
|
||||
return false
|
||||
}
|
||||
} else {
|
||||
if c <= 'Z' {
|
||||
c += 'z' - 'Z'
|
||||
}
|
||||
if c < 'a' || 'z' < c {
|
||||
return false
|
||||
}
|
||||
}
|
||||
b[i] = c
|
||||
}
|
||||
return true
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+87
@@ -0,0 +1,87 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package utf8internal contains low-level utf8-related constants, tables, etc.
|
||||
// that are used internally by the text package.
|
||||
package utf8internal
|
||||
|
||||
// The default lowest and highest continuation byte.
|
||||
const (
|
||||
LoCB = 0x80 // 1000 0000
|
||||
HiCB = 0xBF // 1011 1111
|
||||
)
|
||||
|
||||
// Constants related to getting information of first bytes of UTF-8 sequences.
|
||||
const (
|
||||
// ASCII identifies a UTF-8 byte as ASCII.
|
||||
ASCII = as
|
||||
|
||||
// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
|
||||
// sequence.
|
||||
FirstInvalid = xx
|
||||
|
||||
// SizeMask is a mask for the size bits. Use use x&SizeMask to get the size.
|
||||
SizeMask = 7
|
||||
|
||||
// AcceptShift is the right-shift count for the first byte info byte to get
|
||||
// the index into the AcceptRanges table. See AcceptRanges.
|
||||
AcceptShift = 4
|
||||
|
||||
// The names of these constants are chosen to give nice alignment in the
|
||||
// table below. The first nibble is an index into acceptRanges or F for
|
||||
// special one-byte cases. The second nibble is the Rune length or the
|
||||
// Status for the special one-byte case.
|
||||
xx = 0xF1 // invalid: size 1
|
||||
as = 0xF0 // ASCII: size 1
|
||||
s1 = 0x02 // accept 0, size 2
|
||||
s2 = 0x13 // accept 1, size 3
|
||||
s3 = 0x03 // accept 0, size 3
|
||||
s4 = 0x23 // accept 2, size 3
|
||||
s5 = 0x34 // accept 3, size 4
|
||||
s6 = 0x04 // accept 0, size 4
|
||||
s7 = 0x44 // accept 4, size 4
|
||||
)
|
||||
|
||||
// First is information about the first byte in a UTF-8 sequence.
|
||||
var First = [256]uint8{
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
|
||||
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
|
||||
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
|
||||
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
|
||||
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
|
||||
}
|
||||
|
||||
// AcceptRange gives the range of valid values for the second byte in a UTF-8
|
||||
// sequence for any value for First that is not ASCII or FirstInvalid.
|
||||
type AcceptRange struct {
|
||||
Lo uint8 // lowest value for second byte.
|
||||
Hi uint8 // highest value for second byte.
|
||||
}
|
||||
|
||||
// AcceptRanges is a slice of AcceptRange values. For a given byte sequence b
|
||||
//
|
||||
// AcceptRanges[First[b[0]]>>AcceptShift]
|
||||
//
|
||||
// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting
|
||||
// at b[0].
|
||||
var AcceptRanges = [...]AcceptRange{
|
||||
0: {LoCB, HiCB},
|
||||
1: {0xA0, HiCB},
|
||||
2: {LoCB, 0x9F},
|
||||
3: {0x90, HiCB},
|
||||
4: {LoCB, 0x8F},
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+16
@@ -0,0 +1,16 @@
|
||||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package language
|
||||
|
||||
// This file contains code common to the maketables.go and the package code.
|
||||
|
||||
// langAliasType is the type of an alias in langAliasMap.
|
||||
type langAliasType int8
|
||||
|
||||
const (
|
||||
langDeprecated langAliasType = iota
|
||||
langMacro
|
||||
langLegacy
|
||||
|
||||
langAliasTypeUnknown langAliasType = -1
|
||||
)
|
||||
+197
@@ -0,0 +1,197 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// The Coverage interface is used to define the level of coverage of an
|
||||
// internationalization service. Note that not all types are supported by all
|
||||
// services. As lists may be generated on the fly, it is recommended that users
|
||||
// of a Coverage cache the results.
|
||||
type Coverage interface {
|
||||
// Tags returns the list of supported tags.
|
||||
Tags() []Tag
|
||||
|
||||
// BaseLanguages returns the list of supported base languages.
|
||||
BaseLanguages() []Base
|
||||
|
||||
// Scripts returns the list of supported scripts.
|
||||
Scripts() []Script
|
||||
|
||||
// Regions returns the list of supported regions.
|
||||
Regions() []Region
|
||||
}
|
||||
|
||||
var (
|
||||
// Supported defines a Coverage that lists all supported subtags. Tags
|
||||
// always returns nil.
|
||||
Supported Coverage = allSubtags{}
|
||||
)
|
||||
|
||||
// TODO:
|
||||
// - Support Variants, numbering systems.
|
||||
// - CLDR coverage levels.
|
||||
// - Set of common tags defined in this package.
|
||||
|
||||
type allSubtags struct{}
|
||||
|
||||
// Regions returns the list of supported regions. As all regions are in a
|
||||
// consecutive range, it simply returns a slice of numbers in increasing order.
|
||||
// The "undefined" region is not returned.
|
||||
func (s allSubtags) Regions() []Region {
|
||||
reg := make([]Region, numRegions)
|
||||
for i := range reg {
|
||||
reg[i] = Region{regionID(i + 1)}
|
||||
}
|
||||
return reg
|
||||
}
|
||||
|
||||
// Scripts returns the list of supported scripts. As all scripts are in a
|
||||
// consecutive range, it simply returns a slice of numbers in increasing order.
|
||||
// The "undefined" script is not returned.
|
||||
func (s allSubtags) Scripts() []Script {
|
||||
scr := make([]Script, numScripts)
|
||||
for i := range scr {
|
||||
scr[i] = Script{scriptID(i + 1)}
|
||||
}
|
||||
return scr
|
||||
}
|
||||
|
||||
// BaseLanguages returns the list of all supported base languages. It generates
|
||||
// the list by traversing the internal structures.
|
||||
func (s allSubtags) BaseLanguages() []Base {
|
||||
base := make([]Base, 0, numLanguages)
|
||||
for i := 0; i < langNoIndexOffset; i++ {
|
||||
// We included "und" already for the value 0.
|
||||
if i != nonCanonicalUnd {
|
||||
base = append(base, Base{langID(i)})
|
||||
}
|
||||
}
|
||||
i := langNoIndexOffset
|
||||
for _, v := range langNoIndex {
|
||||
for k := 0; k < 8; k++ {
|
||||
if v&1 == 1 {
|
||||
base = append(base, Base{langID(i)})
|
||||
}
|
||||
v >>= 1
|
||||
i++
|
||||
}
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
// Tags always returns nil.
|
||||
func (s allSubtags) Tags() []Tag {
|
||||
return nil
|
||||
}
|
||||
|
||||
// coverage is used used by NewCoverage which is used as a convenient way for
|
||||
// creating Coverage implementations for partially defined data. Very often a
|
||||
// package will only need to define a subset of slices. coverage provides a
|
||||
// convenient way to do this. Moreover, packages using NewCoverage, instead of
|
||||
// their own implementation, will not break if later new slice types are added.
|
||||
type coverage struct {
|
||||
tags func() []Tag
|
||||
bases func() []Base
|
||||
scripts func() []Script
|
||||
regions func() []Region
|
||||
}
|
||||
|
||||
func (s *coverage) Tags() []Tag {
|
||||
if s.tags == nil {
|
||||
return nil
|
||||
}
|
||||
return s.tags()
|
||||
}
|
||||
|
||||
// bases implements sort.Interface and is used to sort base languages.
|
||||
type bases []Base
|
||||
|
||||
func (b bases) Len() int {
|
||||
return len(b)
|
||||
}
|
||||
|
||||
func (b bases) Swap(i, j int) {
|
||||
b[i], b[j] = b[j], b[i]
|
||||
}
|
||||
|
||||
func (b bases) Less(i, j int) bool {
|
||||
return b[i].langID < b[j].langID
|
||||
}
|
||||
|
||||
// BaseLanguages returns the result from calling s.bases if it is specified or
|
||||
// otherwise derives the set of supported base languages from tags.
|
||||
func (s *coverage) BaseLanguages() []Base {
|
||||
if s.bases == nil {
|
||||
tags := s.Tags()
|
||||
if len(tags) == 0 {
|
||||
return nil
|
||||
}
|
||||
a := make([]Base, len(tags))
|
||||
for i, t := range tags {
|
||||
a[i] = Base{langID(t.lang)}
|
||||
}
|
||||
sort.Sort(bases(a))
|
||||
k := 0
|
||||
for i := 1; i < len(a); i++ {
|
||||
if a[k] != a[i] {
|
||||
k++
|
||||
a[k] = a[i]
|
||||
}
|
||||
}
|
||||
return a[:k+1]
|
||||
}
|
||||
return s.bases()
|
||||
}
|
||||
|
||||
func (s *coverage) Scripts() []Script {
|
||||
if s.scripts == nil {
|
||||
return nil
|
||||
}
|
||||
return s.scripts()
|
||||
}
|
||||
|
||||
func (s *coverage) Regions() []Region {
|
||||
if s.regions == nil {
|
||||
return nil
|
||||
}
|
||||
return s.regions()
|
||||
}
|
||||
|
||||
// NewCoverage returns a Coverage for the given lists. It is typically used by
|
||||
// packages providing internationalization services to define their level of
|
||||
// coverage. A list may be of type []T or func() []T, where T is either Tag,
|
||||
// Base, Script or Region. The returned Coverage derives the value for Bases
|
||||
// from Tags if no func or slice for []Base is specified. For other unspecified
|
||||
// types the returned Coverage will return nil for the respective methods.
|
||||
func NewCoverage(list ...interface{}) Coverage {
|
||||
s := &coverage{}
|
||||
for _, x := range list {
|
||||
switch v := x.(type) {
|
||||
case func() []Base:
|
||||
s.bases = v
|
||||
case func() []Script:
|
||||
s.scripts = v
|
||||
case func() []Region:
|
||||
s.regions = v
|
||||
case func() []Tag:
|
||||
s.tags = v
|
||||
case []Base:
|
||||
s.bases = func() []Base { return v }
|
||||
case []Script:
|
||||
s.scripts = func() []Script { return v }
|
||||
case []Region:
|
||||
s.regions = func() []Region { return v }
|
||||
case []Tag:
|
||||
s.tags = func() []Tag { return v }
|
||||
default:
|
||||
panic(fmt.Sprintf("language: unsupported set type %T", v))
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
+92
@@ -0,0 +1,92 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package display
|
||||
|
||||
// This file contains sets of data for specific languages. Users can use these
|
||||
// to create smaller collections of supported languages and reduce total table
|
||||
// size.
|
||||
|
||||
// The variable names defined here correspond to those in package language.
|
||||
|
||||
var (
|
||||
Afrikaans *Dictionary = &af // af
|
||||
Amharic *Dictionary = &am // am
|
||||
Arabic *Dictionary = &ar // ar
|
||||
ModernStandardArabic *Dictionary = Arabic // ar-001
|
||||
Azerbaijani *Dictionary = &az // az
|
||||
Bulgarian *Dictionary = &bg // bg
|
||||
Bengali *Dictionary = &bn // bn
|
||||
Catalan *Dictionary = &ca // ca
|
||||
Czech *Dictionary = &cs // cs
|
||||
Danish *Dictionary = &da // da
|
||||
German *Dictionary = &de // de
|
||||
Greek *Dictionary = &el // el
|
||||
English *Dictionary = &en // en
|
||||
AmericanEnglish *Dictionary = English // en-US
|
||||
BritishEnglish *Dictionary = English // en-GB
|
||||
Spanish *Dictionary = &es // es
|
||||
EuropeanSpanish *Dictionary = Spanish // es-ES
|
||||
LatinAmericanSpanish *Dictionary = Spanish // es-419
|
||||
Estonian *Dictionary = &et // et
|
||||
Persian *Dictionary = &fa // fa
|
||||
Finnish *Dictionary = &fi // fi
|
||||
Filipino *Dictionary = &fil // fil
|
||||
French *Dictionary = &fr // fr
|
||||
Gujarati *Dictionary = &gu // gu
|
||||
Hebrew *Dictionary = &he // he
|
||||
Hindi *Dictionary = &hi // hi
|
||||
Croatian *Dictionary = &hr // hr
|
||||
Hungarian *Dictionary = &hu // hu
|
||||
Armenian *Dictionary = &hy // hy
|
||||
Indonesian *Dictionary = &id // id
|
||||
Icelandic *Dictionary = &is // is
|
||||
Italian *Dictionary = &it // it
|
||||
Japanese *Dictionary = &ja // ja
|
||||
Georgian *Dictionary = &ka // ka
|
||||
Kazakh *Dictionary = &kk // kk
|
||||
Khmer *Dictionary = &km // km
|
||||
Kannada *Dictionary = &kn // kn
|
||||
Korean *Dictionary = &ko // ko
|
||||
Kirghiz *Dictionary = &ky // ky
|
||||
Lao *Dictionary = &lo // lo
|
||||
Lithuanian *Dictionary = < // lt
|
||||
Latvian *Dictionary = &lv // lv
|
||||
Macedonian *Dictionary = &mk // mk
|
||||
Malayalam *Dictionary = &ml // ml
|
||||
Mongolian *Dictionary = &mn // mn
|
||||
Marathi *Dictionary = &mr // mr
|
||||
Malay *Dictionary = &ms // ms
|
||||
Burmese *Dictionary = &my // my
|
||||
Nepali *Dictionary = &ne // ne
|
||||
Dutch *Dictionary = &nl // nl
|
||||
Norwegian *Dictionary = &no // no
|
||||
Punjabi *Dictionary = &pa // pa
|
||||
Polish *Dictionary = &pl // pl
|
||||
Portuguese *Dictionary = &pt // pt
|
||||
BrazilianPortuguese *Dictionary = Portuguese // pt-BR
|
||||
EuropeanPortuguese *Dictionary = &ptPT // pt-PT
|
||||
Romanian *Dictionary = &ro // ro
|
||||
Russian *Dictionary = &ru // ru
|
||||
Sinhala *Dictionary = &si // si
|
||||
Slovak *Dictionary = &sk // sk
|
||||
Slovenian *Dictionary = &sl // sl
|
||||
Albanian *Dictionary = &sq // sq
|
||||
Serbian *Dictionary = &sr // sr
|
||||
SerbianLatin *Dictionary = &srLatn // sr
|
||||
Swedish *Dictionary = &sv // sv
|
||||
Swahili *Dictionary = &sw // sw
|
||||
Tamil *Dictionary = &ta // ta
|
||||
Telugu *Dictionary = &te // te
|
||||
Thai *Dictionary = &th // th
|
||||
Turkish *Dictionary = &tr // tr
|
||||
Ukrainian *Dictionary = &uk // uk
|
||||
Urdu *Dictionary = &ur // ur
|
||||
Uzbek *Dictionary = &uz // uz
|
||||
Vietnamese *Dictionary = &vi // vi
|
||||
Chinese *Dictionary = &zh // zh
|
||||
SimplifiedChinese *Dictionary = Chinese // zh-Hans
|
||||
TraditionalChinese *Dictionary = &zhHant // zh-Hant
|
||||
Zulu *Dictionary = &zu // zu
|
||||
)
|
||||
+343
@@ -0,0 +1,343 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run maketables.go -output tables.go
|
||||
|
||||
// Package display provides display names for languages, scripts and regions in
|
||||
// a requested language.
|
||||
//
|
||||
// The data is based on CLDR's localeDisplayNames. It includes the names of the
|
||||
// draft level "contributed" or "approved". The resulting tables are quite
|
||||
// large. The display package is designed so that users can reduce the linked-in
|
||||
// table sizes by cherry picking the languages one wishes to support. There is a
|
||||
// Dictionary defined for a selected set of common languages for this purpose.
|
||||
package display // import "golang.org/x/text/language/display"
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/language"
|
||||
)
|
||||
|
||||
/*
|
||||
TODO:
|
||||
All fairly low priority at the moment:
|
||||
- Include alternative and variants as an option (using func options).
|
||||
- Option for returning the empty string for undefined values.
|
||||
- Support variants, currencies, time zones, option names and other data
|
||||
provided in CLDR.
|
||||
- Do various optimizations:
|
||||
- Reduce size of offset tables.
|
||||
- Consider compressing infrequently used languages and decompress on demand.
|
||||
*/
|
||||
|
||||
// A Namer is used to get the name for a given value, such as a Tag, Language,
|
||||
// Script or Region.
|
||||
type Namer interface {
|
||||
// Name returns a display string for the given value. A Namer returns an
|
||||
// empty string for values it does not support. A Namer may support naming
|
||||
// an unspecified value. For example, when getting the name for a region for
|
||||
// a tag that does not have a defined Region, it may return the name for an
|
||||
// unknown region. It is up to the user to filter calls to Name for values
|
||||
// for which one does not want to have a name string.
|
||||
Name(x interface{}) string
|
||||
}
|
||||
|
||||
var (
|
||||
// Supported lists the languages for which names are defined.
|
||||
Supported language.Coverage
|
||||
|
||||
// The set of all possible values for which names are defined. Note that not
|
||||
// all Namer implementations will cover all the values of a given type.
|
||||
// A Namer will return the empty string for unsupported values.
|
||||
Values language.Coverage
|
||||
|
||||
matcher language.Matcher
|
||||
)
|
||||
|
||||
func init() {
|
||||
tags := make([]language.Tag, numSupported)
|
||||
s := supported
|
||||
for i := range tags {
|
||||
p := strings.IndexByte(s, '|')
|
||||
tags[i] = language.Raw.Make(s[:p])
|
||||
s = s[p+1:]
|
||||
}
|
||||
matcher = language.NewMatcher(tags)
|
||||
Supported = language.NewCoverage(tags)
|
||||
|
||||
Values = language.NewCoverage(langTagSet.Tags, supportedScripts, supportedRegions)
|
||||
}
|
||||
|
||||
// Languages returns a Namer for naming languages. It returns nil if there is no
|
||||
// data for the given tag. The type passed to Name must be either language.Base
|
||||
// or language.Tag. Note that the result may differ between passing a tag or its
|
||||
// base language. For example, for English, passing "nl-BE" would return Flemish
|
||||
// whereas passing "nl" returns "Dutch".
|
||||
func Languages(t language.Tag) Namer {
|
||||
if _, index, conf := matcher.Match(t); conf != language.No {
|
||||
return languageNamer(index)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type languageNamer int
|
||||
|
||||
func (n languageNamer) name(i int) string {
|
||||
return lookup(langHeaders[:], int(n), i)
|
||||
}
|
||||
|
||||
// Name implements the Namer interface for language names.
|
||||
func (n languageNamer) Name(x interface{}) string {
|
||||
return nameLanguage(n, x)
|
||||
}
|
||||
|
||||
// nonEmptyIndex walks up the parent chain until a non-empty header is found.
|
||||
// It returns -1 if no index could be found.
|
||||
func nonEmptyIndex(h []header, index int) int {
|
||||
for ; index != -1 && h[index].data == ""; index = int(parents[index]) {
|
||||
}
|
||||
return index
|
||||
}
|
||||
|
||||
// Scripts returns a Namer for naming scripts. It returns nil if there is no
|
||||
// data for the given tag. The type passed to Name must be either a
|
||||
// language.Script or a language.Tag. It will not attempt to infer a script for
|
||||
// tags with an unspecified script.
|
||||
func Scripts(t language.Tag) Namer {
|
||||
if _, index, conf := matcher.Match(t); conf != language.No {
|
||||
if index = nonEmptyIndex(scriptHeaders[:], index); index != -1 {
|
||||
return scriptNamer(index)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type scriptNamer int
|
||||
|
||||
func (n scriptNamer) name(i int) string {
|
||||
return lookup(scriptHeaders[:], int(n), i)
|
||||
}
|
||||
|
||||
// Name implements the Namer interface for script names.
|
||||
func (n scriptNamer) Name(x interface{}) string {
|
||||
return nameScript(n, x)
|
||||
}
|
||||
|
||||
// Regions returns a Namer for naming regions. It returns nil if there is no
|
||||
// data for the given tag. The type passed to Name must be either a
|
||||
// language.Region or a language.Tag. It will not attempt to infer a region for
|
||||
// tags with an unspecified region.
|
||||
func Regions(t language.Tag) Namer {
|
||||
if _, index, conf := matcher.Match(t); conf != language.No {
|
||||
if index = nonEmptyIndex(regionHeaders[:], index); index != -1 {
|
||||
return regionNamer(index)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type regionNamer int
|
||||
|
||||
func (n regionNamer) name(i int) string {
|
||||
return lookup(regionHeaders[:], int(n), i)
|
||||
}
|
||||
|
||||
// Name implements the Namer interface for region names.
|
||||
func (n regionNamer) Name(x interface{}) string {
|
||||
return nameRegion(n, x)
|
||||
}
|
||||
|
||||
// Tags returns a Namer for giving a full description of a tag. The names of
|
||||
// scripts and regions that are not already implied by the language name will
|
||||
// in appended within parentheses. It returns nil if there is not data for the
|
||||
// given tag. The type passed to Name must be a tag.
|
||||
func Tags(t language.Tag) Namer {
|
||||
if _, index, conf := matcher.Match(t); conf != language.No {
|
||||
return tagNamer(index)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type tagNamer int
|
||||
|
||||
// Name implements the Namer interface for tag names.
|
||||
func (n tagNamer) Name(x interface{}) string {
|
||||
return nameTag(languageNamer(n), scriptNamer(n), regionNamer(n), x)
|
||||
}
|
||||
|
||||
// lookup finds the name for an entry in a global table, traversing the
|
||||
// inheritance hierarchy if needed.
|
||||
func lookup(table []header, dict, want int) string {
|
||||
for dict != -1 {
|
||||
if s := table[dict].name(want); s != "" {
|
||||
return s
|
||||
}
|
||||
dict = int(parents[dict])
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// A Dictionary holds a collection of Namers for a single language. One can
|
||||
// reduce the amount of data linked in to a binary by only referencing
|
||||
// Dictionaries for the languages one needs to support instead of using the
|
||||
// generic Namer factories.
|
||||
type Dictionary struct {
|
||||
parent *Dictionary
|
||||
lang header
|
||||
script header
|
||||
region header
|
||||
}
|
||||
|
||||
// Tags returns a Namer for giving a full description of a tag. The names of
|
||||
// scripts and regions that are not already implied by the language name will
|
||||
// in appended within parentheses. It returns nil if there is not data for the
|
||||
// given tag. The type passed to Name must be a tag.
|
||||
func (d *Dictionary) Tags() Namer {
|
||||
return dictTags{d}
|
||||
}
|
||||
|
||||
type dictTags struct {
|
||||
d *Dictionary
|
||||
}
|
||||
|
||||
// Name implements the Namer interface for tag names.
|
||||
func (n dictTags) Name(x interface{}) string {
|
||||
return nameTag(dictLanguages{n.d}, dictScripts{n.d}, dictRegions{n.d}, x)
|
||||
}
|
||||
|
||||
// Languages returns a Namer for naming languages. It returns nil if there is no
|
||||
// data for the given tag. The type passed to Name must be either language.Base
|
||||
// or language.Tag. Note that the result may differ between passing a tag or its
|
||||
// base language. For example, for English, passing "nl-BE" would return Flemish
|
||||
// whereas passing "nl" returns "Dutch".
|
||||
func (d *Dictionary) Languages() Namer {
|
||||
return dictLanguages{d}
|
||||
}
|
||||
|
||||
type dictLanguages struct {
|
||||
d *Dictionary
|
||||
}
|
||||
|
||||
func (n dictLanguages) name(i int) string {
|
||||
for d := n.d; d != nil; d = d.parent {
|
||||
if s := d.lang.name(i); s != "" {
|
||||
return s
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Name implements the Namer interface for language names.
|
||||
func (n dictLanguages) Name(x interface{}) string {
|
||||
return nameLanguage(n, x)
|
||||
}
|
||||
|
||||
// Scripts returns a Namer for naming scripts. It returns nil if there is no
|
||||
// data for the given tag. The type passed to Name must be either a
|
||||
// language.Script or a language.Tag. It will not attempt to infer a script for
|
||||
// tags with an unspecified script.
|
||||
func (d *Dictionary) Scripts() Namer {
|
||||
return dictScripts{d}
|
||||
}
|
||||
|
||||
type dictScripts struct {
|
||||
d *Dictionary
|
||||
}
|
||||
|
||||
func (n dictScripts) name(i int) string {
|
||||
for d := n.d; d != nil; d = d.parent {
|
||||
if s := d.script.name(i); s != "" {
|
||||
return s
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Name implements the Namer interface for script names.
|
||||
func (n dictScripts) Name(x interface{}) string {
|
||||
return nameScript(n, x)
|
||||
}
|
||||
|
||||
// Regions returns a Namer for naming regions. It returns nil if there is no
|
||||
// data for the given tag. The type passed to Name must be either a
|
||||
// language.Region or a language.Tag. It will not attempt to infer a region for
|
||||
// tags with an unspecified region.
|
||||
func (d *Dictionary) Regions() Namer {
|
||||
return dictRegions{d}
|
||||
}
|
||||
|
||||
type dictRegions struct {
|
||||
d *Dictionary
|
||||
}
|
||||
|
||||
func (n dictRegions) name(i int) string {
|
||||
for d := n.d; d != nil; d = d.parent {
|
||||
if s := d.region.name(i); s != "" {
|
||||
return s
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Name implements the Namer interface for region names.
|
||||
func (n dictRegions) Name(x interface{}) string {
|
||||
return nameRegion(n, x)
|
||||
}
|
||||
|
||||
// A SelfNamer implements a Namer that returns the name of language in this same
|
||||
// language. It provides a very compact mechanism to provide a comprehensive
|
||||
// list of languages to users in their native language.
|
||||
type SelfNamer struct {
|
||||
// Supported defines the values supported by this Namer.
|
||||
Supported language.Coverage
|
||||
}
|
||||
|
||||
var (
|
||||
// Self is a shared instance of a SelfNamer.
|
||||
Self *SelfNamer = &self
|
||||
|
||||
self = SelfNamer{language.NewCoverage(selfTagSet.Tags)}
|
||||
)
|
||||
|
||||
// Name returns the name of a given language tag in the language identified by
|
||||
// this tag. It supports both the language.Base and language.Tag types.
|
||||
func (n SelfNamer) Name(x interface{}) string {
|
||||
t, _ := language.All.Compose(x)
|
||||
base, scr, reg := t.Raw()
|
||||
baseScript := language.Script{}
|
||||
if (scr == language.Script{} && reg != language.Region{}) {
|
||||
// For looking up in the self dictionary, we need to select the
|
||||
// maximized script. This is even the case if the script isn't
|
||||
// specified.
|
||||
s1, _ := t.Script()
|
||||
if baseScript = getScript(base); baseScript != s1 {
|
||||
scr = s1
|
||||
}
|
||||
}
|
||||
|
||||
i, scr, reg := selfTagSet.index(base, scr, reg)
|
||||
if i == -1 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Only return the display name if the script matches the expected script.
|
||||
if (scr != language.Script{}) {
|
||||
if (baseScript == language.Script{}) {
|
||||
baseScript = getScript(base)
|
||||
}
|
||||
if baseScript != scr {
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
return selfHeaders[0].name(i)
|
||||
}
|
||||
|
||||
// getScript returns the maximized script for a base language.
|
||||
func getScript(b language.Base) language.Script {
|
||||
tag, _ := language.Raw.Compose(b)
|
||||
scr, _ := tag.Script()
|
||||
return scr
|
||||
}
|
||||
+251
@@ -0,0 +1,251 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package display
|
||||
|
||||
// This file contains common lookup code that is shared between the various
|
||||
// implementations of Namer and Dictionaries.
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/language"
|
||||
)
|
||||
|
||||
type namer interface {
|
||||
// name gets the string for the given index. It should walk the
|
||||
// inheritance chain if a value is not present in the base index.
|
||||
name(idx int) string
|
||||
}
|
||||
|
||||
func nameLanguage(n namer, x interface{}) string {
|
||||
t, _ := language.All.Compose(x)
|
||||
for {
|
||||
i, _, _ := langTagSet.index(t.Raw())
|
||||
if s := n.name(i); s != "" {
|
||||
return s
|
||||
}
|
||||
if t = t.Parent(); t == language.Und {
|
||||
return ""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func nameScript(n namer, x interface{}) string {
|
||||
t, _ := language.DeprecatedScript.Compose(x)
|
||||
_, s, _ := t.Raw()
|
||||
return n.name(scriptIndex.index(s.String()))
|
||||
}
|
||||
|
||||
func nameRegion(n namer, x interface{}) string {
|
||||
t, _ := language.DeprecatedRegion.Compose(x)
|
||||
_, _, r := t.Raw()
|
||||
return n.name(regionIndex.index(r.String()))
|
||||
}
|
||||
|
||||
func nameTag(langN, scrN, regN namer, x interface{}) string {
|
||||
t, ok := x.(language.Tag)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
const form = language.All &^ language.SuppressScript
|
||||
if c, err := form.Canonicalize(t); err == nil {
|
||||
t = c
|
||||
}
|
||||
_, sRaw, rRaw := t.Raw()
|
||||
i, scr, reg := langTagSet.index(t.Raw())
|
||||
for i != -1 {
|
||||
if str := langN.name(i); str != "" {
|
||||
if hasS, hasR := (scr != language.Script{}), (reg != language.Region{}); hasS || hasR {
|
||||
ss, sr := "", ""
|
||||
if hasS {
|
||||
ss = scrN.name(scriptIndex.index(scr.String()))
|
||||
}
|
||||
if hasR {
|
||||
sr = regN.name(regionIndex.index(reg.String()))
|
||||
}
|
||||
// TODO: use patterns in CLDR or at least confirm they are the
|
||||
// same for all languages.
|
||||
if ss != "" && sr != "" {
|
||||
return fmt.Sprintf("%s (%s, %s)", str, ss, sr)
|
||||
}
|
||||
if ss != "" || sr != "" {
|
||||
return fmt.Sprintf("%s (%s%s)", str, ss, sr)
|
||||
}
|
||||
}
|
||||
return str
|
||||
}
|
||||
scr, reg = sRaw, rRaw
|
||||
if t = t.Parent(); t == language.Und {
|
||||
return ""
|
||||
}
|
||||
i, _, _ = langTagSet.index(t.Raw())
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// header contains the data and indexes for a single namer.
|
||||
// data contains a series of strings concatenated into one. index contains the
|
||||
// offsets for a string in data. For example, consider a header that defines
|
||||
// strings for the languages de, el, en, fi, and nl:
|
||||
//
|
||||
// header{
|
||||
// data: "GermanGreekEnglishDutch",
|
||||
// index: []uint16{ 0, 6, 11, 18, 18, 23 },
|
||||
// }
|
||||
//
|
||||
// For a language with index i, the string is defined by
|
||||
// data[index[i]:index[i+1]]. So the number of elements in index is always one
|
||||
// greater than the number of languages for which header defines a value.
|
||||
// A string for a language may be empty, which means the name is undefined. In
|
||||
// the above example, the name for fi (Finnish) is undefined.
|
||||
type header struct {
|
||||
data string
|
||||
index []uint16
|
||||
}
|
||||
|
||||
// name looks up the name for a tag in the dictionary, given its index.
|
||||
func (h *header) name(i int) string {
|
||||
if 0 <= i && i < len(h.index)-1 {
|
||||
return h.data[h.index[i]:h.index[i+1]]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// tagSet is used to find the index of a language in a set of tags.
|
||||
type tagSet struct {
|
||||
single tagIndex
|
||||
long []string
|
||||
}
|
||||
|
||||
var (
|
||||
langTagSet = tagSet{
|
||||
single: langIndex,
|
||||
long: langTagsLong,
|
||||
}
|
||||
|
||||
// selfTagSet is used for indexing the language strings in their own
|
||||
// language.
|
||||
selfTagSet = tagSet{
|
||||
single: selfIndex,
|
||||
long: selfTagsLong,
|
||||
}
|
||||
|
||||
zzzz = language.MustParseScript("Zzzz")
|
||||
zz = language.MustParseRegion("ZZ")
|
||||
)
|
||||
|
||||
// index returns the index of the tag for the given base, script and region or
|
||||
// its parent if the tag is not available. If the match is for a parent entry,
|
||||
// the excess script and region are returned.
|
||||
func (ts *tagSet) index(base language.Base, scr language.Script, reg language.Region) (int, language.Script, language.Region) {
|
||||
lang := base.String()
|
||||
index := -1
|
||||
if (scr != language.Script{} || reg != language.Region{}) {
|
||||
if scr == zzzz {
|
||||
scr = language.Script{}
|
||||
}
|
||||
if reg == zz {
|
||||
reg = language.Region{}
|
||||
}
|
||||
|
||||
i := sort.SearchStrings(ts.long, lang)
|
||||
// All entries have either a script or a region and not both.
|
||||
scrStr, regStr := scr.String(), reg.String()
|
||||
for ; i < len(ts.long) && strings.HasPrefix(ts.long[i], lang); i++ {
|
||||
if s := ts.long[i][len(lang)+1:]; s == scrStr {
|
||||
scr = language.Script{}
|
||||
index = i + ts.single.len()
|
||||
break
|
||||
} else if s == regStr {
|
||||
reg = language.Region{}
|
||||
index = i + ts.single.len()
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if index == -1 {
|
||||
index = ts.single.index(lang)
|
||||
}
|
||||
return index, scr, reg
|
||||
}
|
||||
|
||||
func (ts *tagSet) Tags() []language.Tag {
|
||||
tags := make([]language.Tag, 0, ts.single.len()+len(ts.long))
|
||||
ts.single.keys(func(s string) {
|
||||
tags = append(tags, language.Raw.MustParse(s))
|
||||
})
|
||||
for _, s := range ts.long {
|
||||
tags = append(tags, language.Raw.MustParse(s))
|
||||
}
|
||||
return tags
|
||||
}
|
||||
|
||||
func supportedScripts() []language.Script {
|
||||
scr := make([]language.Script, 0, scriptIndex.len())
|
||||
scriptIndex.keys(func(s string) {
|
||||
scr = append(scr, language.MustParseScript(s))
|
||||
})
|
||||
return scr
|
||||
}
|
||||
|
||||
func supportedRegions() []language.Region {
|
||||
reg := make([]language.Region, 0, regionIndex.len())
|
||||
regionIndex.keys(func(s string) {
|
||||
reg = append(reg, language.MustParseRegion(s))
|
||||
})
|
||||
return reg
|
||||
}
|
||||
|
||||
// tagIndex holds a concatenated lists of subtags of length 2 to 4, one string
|
||||
// for each length, which can be used in combination with binary search to get
|
||||
// the index associated with a tag.
|
||||
// For example, a tagIndex{
|
||||
// "arenesfrruzh", // 6 2-byte tags.
|
||||
// "barwae", // 2 3-byte tags.
|
||||
// "",
|
||||
// }
|
||||
// would mean that the 2-byte tag "fr" had an index of 3, and the 3-byte tag
|
||||
// "wae" had an index of 7.
|
||||
type tagIndex [3]string
|
||||
|
||||
func (t *tagIndex) index(s string) int {
|
||||
sz := len(s)
|
||||
if sz < 2 || 4 < sz {
|
||||
return -1
|
||||
}
|
||||
a := t[sz-2]
|
||||
index := sort.Search(len(a)/sz, func(i int) bool {
|
||||
p := i * sz
|
||||
return a[p:p+sz] >= s
|
||||
})
|
||||
p := index * sz
|
||||
if end := p + sz; end > len(a) || a[p:end] != s {
|
||||
return -1
|
||||
}
|
||||
// Add the number of tags for smaller sizes.
|
||||
for i := 0; i < sz-2; i++ {
|
||||
index += len(t[i]) / (i + 2)
|
||||
}
|
||||
return index
|
||||
}
|
||||
|
||||
// len returns the number of tags that are contained in the tagIndex.
|
||||
func (t *tagIndex) len() (n int) {
|
||||
for i, s := range t {
|
||||
n += len(s) / (i + 2)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// keys calls f for each tag.
|
||||
func (t *tagIndex) keys(f func(key string)) {
|
||||
for i, s := range *t {
|
||||
for ; s != ""; s = s[i+2:] {
|
||||
f(s[:i+2])
|
||||
}
|
||||
}
|
||||
}
|
||||
+596
@@ -0,0 +1,596 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Generator for display name tables.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/language"
|
||||
"golang.org/x/text/unicode/cldr"
|
||||
)
|
||||
|
||||
var (
|
||||
test = flag.Bool("test", false,
|
||||
"test existing tables; can be used to compare web data with package data.")
|
||||
outputFile = flag.String("output", "tables.go", "output file")
|
||||
|
||||
stats = flag.Bool("stats", false, "prints statistics to stderr")
|
||||
|
||||
short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
|
||||
draft = flag.String("draft",
|
||||
"contributed",
|
||||
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
|
||||
pkg = flag.String("package",
|
||||
"display",
|
||||
"the name of the package in which the generated file is to be included")
|
||||
|
||||
tags = newTagSet("tags",
|
||||
[]language.Tag{},
|
||||
"space-separated list of tags to include or empty for all")
|
||||
dict = newTagSet("dict",
|
||||
dictTags(),
|
||||
"space-separated list or tags for which to include a Dictionary. "+
|
||||
`"" means the common list from go.text/language.`)
|
||||
)
|
||||
|
||||
func dictTags() (tag []language.Tag) {
|
||||
// TODO: replace with language.Common.Tags() once supported.
|
||||
const str = "af am ar ar-001 az bg bn ca cs da de el en en-US en-GB " +
|
||||
"es es-ES es-419 et fa fi fil fr fr-CA gu he hi hr hu hy id is it ja " +
|
||||
"ka kk km kn ko ky lo lt lv mk ml mn mr ms my ne nl no pa pl pt pt-BR " +
|
||||
"pt-PT ro ru si sk sl sq sr sr-Latn sv sw ta te th tr uk ur uz vi " +
|
||||
"zh zh-Hans zh-Hant zu"
|
||||
|
||||
for _, s := range strings.Split(str, " ") {
|
||||
tag = append(tag, language.MustParse(s))
|
||||
}
|
||||
return tag
|
||||
}
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
|
||||
// Read the CLDR zip file.
|
||||
r := gen.OpenCLDRCoreZip()
|
||||
defer r.Close()
|
||||
|
||||
d := &cldr.Decoder{}
|
||||
d.SetDirFilter("main", "supplemental")
|
||||
d.SetSectionFilter("localeDisplayNames")
|
||||
data, err := d.DecodeZip(r)
|
||||
if err != nil {
|
||||
log.Fatalf("DecodeZip: %v", err)
|
||||
}
|
||||
|
||||
w := gen.NewCodeWriter()
|
||||
defer w.WriteGoFile(*outputFile, "display")
|
||||
|
||||
gen.WriteCLDRVersion(w)
|
||||
|
||||
b := builder{
|
||||
w: w,
|
||||
data: data,
|
||||
group: make(map[string]*group),
|
||||
}
|
||||
b.generate()
|
||||
}
|
||||
|
||||
const tagForm = language.All
|
||||
|
||||
// tagSet is used to parse command line flags of tags. It implements the
|
||||
// flag.Value interface.
|
||||
type tagSet map[language.Tag]bool
|
||||
|
||||
func newTagSet(name string, tags []language.Tag, usage string) tagSet {
|
||||
f := tagSet(make(map[language.Tag]bool))
|
||||
for _, t := range tags {
|
||||
f[t] = true
|
||||
}
|
||||
flag.Var(f, name, usage)
|
||||
return f
|
||||
}
|
||||
|
||||
// String implements the String method of the flag.Value interface.
|
||||
func (f tagSet) String() string {
|
||||
tags := []string{}
|
||||
for t := range f {
|
||||
tags = append(tags, t.String())
|
||||
}
|
||||
sort.Strings(tags)
|
||||
return strings.Join(tags, " ")
|
||||
}
|
||||
|
||||
// Set implements Set from the flag.Value interface.
|
||||
func (f tagSet) Set(s string) error {
|
||||
if s != "" {
|
||||
for _, s := range strings.Split(s, " ") {
|
||||
if s != "" {
|
||||
tag, err := tagForm.Parse(s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
f[tag] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f tagSet) contains(t language.Tag) bool {
|
||||
if len(f) == 0 {
|
||||
return true
|
||||
}
|
||||
return f[t]
|
||||
}
|
||||
|
||||
// builder is used to create all tables with display name information.
|
||||
type builder struct {
|
||||
w *gen.CodeWriter
|
||||
|
||||
data *cldr.CLDR
|
||||
|
||||
fromLocs []string
|
||||
|
||||
// destination tags for the current locale.
|
||||
toTags []string
|
||||
toTagIndex map[string]int
|
||||
|
||||
// list of supported tags
|
||||
supported []language.Tag
|
||||
|
||||
// key-value pairs per group
|
||||
group map[string]*group
|
||||
|
||||
// statistics
|
||||
sizeIndex int // total size of all indexes of headers
|
||||
sizeData int // total size of all data of headers
|
||||
totalSize int
|
||||
}
|
||||
|
||||
type group struct {
|
||||
// Maps from a given language to the Namer data for this language.
|
||||
lang map[language.Tag]keyValues
|
||||
headers []header
|
||||
|
||||
toTags []string
|
||||
threeStart int
|
||||
fourPlusStart int
|
||||
}
|
||||
|
||||
// set sets the typ to the name for locale loc.
|
||||
func (g *group) set(t language.Tag, typ, name string) {
|
||||
kv := g.lang[t]
|
||||
if kv == nil {
|
||||
kv = make(keyValues)
|
||||
g.lang[t] = kv
|
||||
}
|
||||
if kv[typ] == "" {
|
||||
kv[typ] = name
|
||||
}
|
||||
}
|
||||
|
||||
type keyValues map[string]string
|
||||
|
||||
type header struct {
|
||||
tag language.Tag
|
||||
data string
|
||||
index []uint16
|
||||
}
|
||||
|
||||
var versionInfo = `// Version is deprecated. Use CLDRVersion.
|
||||
const Version = %#v
|
||||
|
||||
`
|
||||
|
||||
var self = language.MustParse("mul")
|
||||
|
||||
// generate builds and writes all tables.
|
||||
func (b *builder) generate() {
|
||||
fmt.Fprintf(b.w, versionInfo, cldr.Version)
|
||||
|
||||
b.filter()
|
||||
b.setData("lang", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
|
||||
if ldn.Languages != nil {
|
||||
for _, v := range ldn.Languages.Language {
|
||||
tag := tagForm.MustParse(v.Type)
|
||||
if tags.contains(tag) {
|
||||
g.set(loc, tag.String(), v.Data())
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
b.setData("script", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
|
||||
if ldn.Scripts != nil {
|
||||
for _, v := range ldn.Scripts.Script {
|
||||
code := language.MustParseScript(v.Type)
|
||||
if code.IsPrivateUse() { // Qaaa..Qabx
|
||||
// TODO: data currently appears to be very meager.
|
||||
// Reconsider if we have data for English.
|
||||
if loc == language.English {
|
||||
log.Fatal("Consider including data for private use scripts.")
|
||||
}
|
||||
continue
|
||||
}
|
||||
g.set(loc, code.String(), v.Data())
|
||||
}
|
||||
}
|
||||
})
|
||||
b.setData("region", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
|
||||
if ldn.Territories != nil {
|
||||
for _, v := range ldn.Territories.Territory {
|
||||
g.set(loc, language.MustParseRegion(v.Type).String(), v.Data())
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
b.makeSupported()
|
||||
|
||||
b.writeParents()
|
||||
|
||||
b.writeGroup("lang")
|
||||
b.writeGroup("script")
|
||||
b.writeGroup("region")
|
||||
|
||||
b.w.WriteConst("numSupported", len(b.supported))
|
||||
buf := bytes.Buffer{}
|
||||
for _, tag := range b.supported {
|
||||
fmt.Fprint(&buf, tag.String(), "|")
|
||||
}
|
||||
b.w.WriteConst("supported", buf.String())
|
||||
|
||||
b.writeDictionaries()
|
||||
|
||||
b.supported = []language.Tag{self}
|
||||
|
||||
// Compute the names of locales in their own language. Some of these names
|
||||
// may be specified in their parent locales. We iterate the maximum depth
|
||||
// of the parent three times to match successive parents of tags until a
|
||||
// possible match is found.
|
||||
for i := 0; i < 4; i++ {
|
||||
b.setData("self", func(g *group, tag language.Tag, ldn *cldr.LocaleDisplayNames) {
|
||||
parent := tag
|
||||
if b, s, r := tag.Raw(); i > 0 && (s != language.Script{} && r == language.Region{}) {
|
||||
parent, _ = language.Raw.Compose(b)
|
||||
}
|
||||
if ldn.Languages != nil {
|
||||
for _, v := range ldn.Languages.Language {
|
||||
key := tagForm.MustParse(v.Type)
|
||||
saved := key
|
||||
if key == parent {
|
||||
g.set(self, tag.String(), v.Data())
|
||||
}
|
||||
for k := 0; k < i; k++ {
|
||||
key = key.Parent()
|
||||
}
|
||||
if key == tag {
|
||||
g.set(self, saved.String(), v.Data()) // set does not overwrite a value.
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
b.writeGroup("self")
|
||||
}
|
||||
|
||||
func (b *builder) setData(name string, f func(*group, language.Tag, *cldr.LocaleDisplayNames)) {
|
||||
b.sizeIndex = 0
|
||||
b.sizeData = 0
|
||||
b.toTags = nil
|
||||
b.fromLocs = nil
|
||||
b.toTagIndex = make(map[string]int)
|
||||
|
||||
g := b.group[name]
|
||||
if g == nil {
|
||||
g = &group{lang: make(map[language.Tag]keyValues)}
|
||||
b.group[name] = g
|
||||
}
|
||||
for _, loc := range b.data.Locales() {
|
||||
// We use RawLDML instead of LDML as we are managing our own inheritance
|
||||
// in this implementation.
|
||||
ldml := b.data.RawLDML(loc)
|
||||
|
||||
// We do not support the POSIX variant (it is not a supported BCP 47
|
||||
// variant). This locale also doesn't happen to contain any data, so
|
||||
// we'll skip it by checking for this.
|
||||
tag, err := tagForm.Parse(loc)
|
||||
if err != nil {
|
||||
if ldml.LocaleDisplayNames != nil {
|
||||
log.Fatalf("setData: %v", err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if ldml.LocaleDisplayNames != nil && tags.contains(tag) {
|
||||
f(g, tag, ldml.LocaleDisplayNames)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (b *builder) filter() {
|
||||
filter := func(s *cldr.Slice) {
|
||||
if *short {
|
||||
s.SelectOnePerGroup("alt", []string{"short", ""})
|
||||
} else {
|
||||
s.SelectOnePerGroup("alt", []string{"stand-alone", ""})
|
||||
}
|
||||
d, err := cldr.ParseDraft(*draft)
|
||||
if err != nil {
|
||||
log.Fatalf("filter: %v", err)
|
||||
}
|
||||
s.SelectDraft(d)
|
||||
}
|
||||
for _, loc := range b.data.Locales() {
|
||||
if ldn := b.data.RawLDML(loc).LocaleDisplayNames; ldn != nil {
|
||||
if ldn.Languages != nil {
|
||||
s := cldr.MakeSlice(&ldn.Languages.Language)
|
||||
if filter(&s); len(ldn.Languages.Language) == 0 {
|
||||
ldn.Languages = nil
|
||||
}
|
||||
}
|
||||
if ldn.Scripts != nil {
|
||||
s := cldr.MakeSlice(&ldn.Scripts.Script)
|
||||
if filter(&s); len(ldn.Scripts.Script) == 0 {
|
||||
ldn.Scripts = nil
|
||||
}
|
||||
}
|
||||
if ldn.Territories != nil {
|
||||
s := cldr.MakeSlice(&ldn.Territories.Territory)
|
||||
if filter(&s); len(ldn.Territories.Territory) == 0 {
|
||||
ldn.Territories = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// makeSupported creates a list of all supported locales.
|
||||
func (b *builder) makeSupported() {
|
||||
// tags across groups
|
||||
for _, g := range b.group {
|
||||
for t, _ := range g.lang {
|
||||
b.supported = append(b.supported, t)
|
||||
}
|
||||
}
|
||||
b.supported = b.supported[:unique(tagsSorter(b.supported))]
|
||||
|
||||
}
|
||||
|
||||
type tagsSorter []language.Tag
|
||||
|
||||
func (a tagsSorter) Len() int { return len(a) }
|
||||
func (a tagsSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a tagsSorter) Less(i, j int) bool { return a[i].String() < a[j].String() }
|
||||
|
||||
func (b *builder) writeGroup(name string) {
|
||||
g := b.group[name]
|
||||
|
||||
for _, kv := range g.lang {
|
||||
for t, _ := range kv {
|
||||
g.toTags = append(g.toTags, t)
|
||||
}
|
||||
}
|
||||
g.toTags = g.toTags[:unique(tagsBySize(g.toTags))]
|
||||
|
||||
// Allocate header per supported value.
|
||||
g.headers = make([]header, len(b.supported))
|
||||
for i, sup := range b.supported {
|
||||
kv, ok := g.lang[sup]
|
||||
if !ok {
|
||||
g.headers[i].tag = sup
|
||||
continue
|
||||
}
|
||||
data := []byte{}
|
||||
index := make([]uint16, len(g.toTags), len(g.toTags)+1)
|
||||
for j, t := range g.toTags {
|
||||
index[j] = uint16(len(data))
|
||||
data = append(data, kv[t]...)
|
||||
}
|
||||
index = append(index, uint16(len(data)))
|
||||
|
||||
// Trim the tail of the index.
|
||||
// TODO: indexes can be reduced in size quite a bit more.
|
||||
n := len(index)
|
||||
for ; n >= 2 && index[n-2] == index[n-1]; n-- {
|
||||
}
|
||||
index = index[:n]
|
||||
|
||||
// Workaround for a bug in CLDR 26.
|
||||
// See http://unicode.org/cldr/trac/ticket/8042.
|
||||
if cldr.Version == "26" && sup.String() == "hsb" {
|
||||
data = bytes.Replace(data, []byte{'"'}, nil, 1)
|
||||
}
|
||||
g.headers[i] = header{sup, string(data), index}
|
||||
}
|
||||
g.writeTable(b.w, name)
|
||||
}
|
||||
|
||||
type tagsBySize []string
|
||||
|
||||
func (l tagsBySize) Len() int { return len(l) }
|
||||
func (l tagsBySize) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
|
||||
func (l tagsBySize) Less(i, j int) bool {
|
||||
a, b := l[i], l[j]
|
||||
// Sort single-tag entries based on size first. Otherwise alphabetic.
|
||||
if len(a) != len(b) && (len(a) <= 4 || len(b) <= 4) {
|
||||
return len(a) < len(b)
|
||||
}
|
||||
return a < b
|
||||
}
|
||||
|
||||
// parentIndices returns slice a of len(tags) where tags[a[i]] is the parent
|
||||
// of tags[i].
|
||||
func parentIndices(tags []language.Tag) []int16 {
|
||||
index := make(map[language.Tag]int16)
|
||||
for i, t := range tags {
|
||||
index[t] = int16(i)
|
||||
}
|
||||
|
||||
// Construct default parents.
|
||||
parents := make([]int16, len(tags))
|
||||
for i, t := range tags {
|
||||
parents[i] = -1
|
||||
for t = t.Parent(); t != language.Und; t = t.Parent() {
|
||||
if j, ok := index[t]; ok {
|
||||
parents[i] = j
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return parents
|
||||
}
|
||||
|
||||
func (b *builder) writeParents() {
|
||||
parents := parentIndices(b.supported)
|
||||
fmt.Fprintf(b.w, "var parents = ")
|
||||
b.w.WriteArray(parents)
|
||||
}
|
||||
|
||||
// writeKeys writes keys to a special index used by the display package.
|
||||
// tags are assumed to be sorted by length.
|
||||
func writeKeys(w *gen.CodeWriter, name string, keys []string) {
|
||||
w.Size += int(3 * reflect.TypeOf("").Size())
|
||||
w.WriteComment("Number of keys: %d", len(keys))
|
||||
fmt.Fprintf(w, "var (\n\t%sIndex = tagIndex{\n", name)
|
||||
for i := 2; i <= 4; i++ {
|
||||
sub := []string{}
|
||||
for _, t := range keys {
|
||||
if len(t) != i {
|
||||
break
|
||||
}
|
||||
sub = append(sub, t)
|
||||
}
|
||||
s := strings.Join(sub, "")
|
||||
w.WriteString(s)
|
||||
fmt.Fprintf(w, ",\n")
|
||||
keys = keys[len(sub):]
|
||||
}
|
||||
fmt.Fprintln(w, "\t}")
|
||||
if len(keys) > 0 {
|
||||
w.Size += int(reflect.TypeOf([]string{}).Size())
|
||||
fmt.Fprintf(w, "\t%sTagsLong = ", name)
|
||||
w.WriteSlice(keys)
|
||||
}
|
||||
fmt.Fprintln(w, ")\n")
|
||||
}
|
||||
|
||||
// identifier creates an identifier from the given tag.
|
||||
func identifier(t language.Tag) string {
|
||||
return strings.Replace(t.String(), "-", "", -1)
|
||||
}
|
||||
|
||||
func (h *header) writeEntry(w *gen.CodeWriter, name string) {
|
||||
if len(dict) > 0 && dict.contains(h.tag) {
|
||||
fmt.Fprintf(w, "\t{ // %s\n", h.tag)
|
||||
fmt.Fprintf(w, "\t\t%[1]s%[2]sStr,\n\t\t%[1]s%[2]sIdx,\n", identifier(h.tag), name)
|
||||
fmt.Fprintln(w, "\t},")
|
||||
} else if len(h.data) == 0 {
|
||||
fmt.Fprintln(w, "\t\t{}, //", h.tag)
|
||||
} else {
|
||||
fmt.Fprintf(w, "\t{ // %s\n", h.tag)
|
||||
w.WriteString(h.data)
|
||||
fmt.Fprintln(w, ",")
|
||||
w.WriteSlice(h.index)
|
||||
fmt.Fprintln(w, ",\n\t},")
|
||||
}
|
||||
}
|
||||
|
||||
// write the data for the given header as single entries. The size for this data
|
||||
// was already accounted for in writeEntry.
|
||||
func (h *header) writeSingle(w *gen.CodeWriter, name string) {
|
||||
if len(dict) > 0 && dict.contains(h.tag) {
|
||||
tag := identifier(h.tag)
|
||||
w.WriteConst(tag+name+"Str", h.data)
|
||||
|
||||
// Note that we create a slice instead of an array. If we use an array
|
||||
// we need to refer to it as a[:] in other tables, which will cause the
|
||||
// array to always be included by the linker. See Issue 7651.
|
||||
w.WriteVar(tag+name+"Idx", h.index)
|
||||
}
|
||||
}
|
||||
|
||||
// WriteTable writes an entry for a single Namer.
|
||||
func (g *group) writeTable(w *gen.CodeWriter, name string) {
|
||||
start := w.Size
|
||||
writeKeys(w, name, g.toTags)
|
||||
w.Size += len(g.headers) * int(reflect.ValueOf(g.headers[0]).Type().Size())
|
||||
|
||||
fmt.Fprintf(w, "var %sHeaders = [%d]header{\n", name, len(g.headers))
|
||||
|
||||
title := strings.Title(name)
|
||||
for _, h := range g.headers {
|
||||
h.writeEntry(w, title)
|
||||
}
|
||||
fmt.Fprintln(w, "}\n")
|
||||
|
||||
for _, h := range g.headers {
|
||||
h.writeSingle(w, title)
|
||||
}
|
||||
n := w.Size - start
|
||||
fmt.Fprintf(w, "// Total size for %s: %d bytes (%d KB)\n\n", name, n, n/1000)
|
||||
}
|
||||
|
||||
func (b *builder) writeDictionaries() {
|
||||
fmt.Fprintln(b.w, "// Dictionary entries of frequent languages")
|
||||
fmt.Fprintln(b.w, "var (")
|
||||
parents := parentIndices(b.supported)
|
||||
|
||||
for i, t := range b.supported {
|
||||
if dict.contains(t) {
|
||||
ident := identifier(t)
|
||||
fmt.Fprintf(b.w, "\t%s = Dictionary{ // %s\n", ident, t)
|
||||
if p := parents[i]; p == -1 {
|
||||
fmt.Fprintln(b.w, "\t\tnil,")
|
||||
} else {
|
||||
fmt.Fprintf(b.w, "\t\t&%s,\n", identifier(b.supported[p]))
|
||||
}
|
||||
fmt.Fprintf(b.w, "\t\theader{%[1]sLangStr, %[1]sLangIdx},\n", ident)
|
||||
fmt.Fprintf(b.w, "\t\theader{%[1]sScriptStr, %[1]sScriptIdx},\n", ident)
|
||||
fmt.Fprintf(b.w, "\t\theader{%[1]sRegionStr, %[1]sRegionIdx},\n", ident)
|
||||
fmt.Fprintln(b.w, "\t}")
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(b.w, ")")
|
||||
|
||||
var s string
|
||||
var a []uint16
|
||||
sz := reflect.TypeOf(s).Size()
|
||||
sz += reflect.TypeOf(a).Size()
|
||||
sz *= 3
|
||||
sz += reflect.TypeOf(&a).Size()
|
||||
n := int(sz) * len(dict)
|
||||
fmt.Fprintf(b.w, "// Total size for %d entries: %d bytes (%d KB)\n\n", len(dict), n, n/1000)
|
||||
|
||||
b.w.Size += n
|
||||
}
|
||||
|
||||
// unique sorts the given lists and removes duplicate entries by swapping them
|
||||
// past position k, where k is the number of unique values. It returns k.
|
||||
func unique(a sort.Interface) int {
|
||||
if a.Len() == 0 {
|
||||
return 0
|
||||
}
|
||||
sort.Sort(a)
|
||||
k := 1
|
||||
for i := 1; i < a.Len(); i++ {
|
||||
if a.Less(k-1, i) {
|
||||
if k != i {
|
||||
a.Swap(k, i)
|
||||
}
|
||||
k++
|
||||
}
|
||||
}
|
||||
return k
|
||||
}
|
||||
+50345
File diff suppressed because it is too large
Load Diff
+20
@@ -0,0 +1,20 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This file contains code common to the maketables.go and the package code.
|
||||
|
||||
// langAliasType is the type of an alias in langAliasMap.
|
||||
type langAliasType int8
|
||||
|
||||
const (
|
||||
langDeprecated langAliasType = iota
|
||||
langMacro
|
||||
langLegacy
|
||||
|
||||
langAliasTypeUnknown langAliasType = -1
|
||||
)
|
||||
+162
@@ -0,0 +1,162 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
package main
|
||||
|
||||
// This file generates derivative tables based on the language package itself.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/gen"
|
||||
"golang.org/x/text/language"
|
||||
"golang.org/x/text/unicode/cldr"
|
||||
)
|
||||
|
||||
var (
|
||||
test = flag.Bool("test", false,
|
||||
"test existing tables; can be used to compare web data with package data.")
|
||||
|
||||
draft = flag.String("draft",
|
||||
"contributed",
|
||||
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
gen.Init()
|
||||
|
||||
// Read the CLDR zip file.
|
||||
r := gen.OpenCLDRCoreZip()
|
||||
defer r.Close()
|
||||
|
||||
d := &cldr.Decoder{}
|
||||
data, err := d.DecodeZip(r)
|
||||
if err != nil {
|
||||
log.Fatalf("DecodeZip: %v", err)
|
||||
}
|
||||
|
||||
w := gen.NewCodeWriter()
|
||||
defer func() {
|
||||
buf := &bytes.Buffer{}
|
||||
|
||||
if _, err = w.WriteGo(buf, "language"); err != nil {
|
||||
log.Fatalf("Error formatting file index.go: %v", err)
|
||||
}
|
||||
|
||||
// Since we're generating a table for our own package we need to rewrite
|
||||
// doing the equivalent of go fmt -r 'language.b -> b'. Using
|
||||
// bytes.Replace will do.
|
||||
out := bytes.Replace(buf.Bytes(), []byte("language."), nil, -1)
|
||||
if err := ioutil.WriteFile("index.go", out, 0600); err != nil {
|
||||
log.Fatalf("Could not create file index.go: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
m := map[language.Tag]bool{}
|
||||
for _, lang := range data.Locales() {
|
||||
// We include all locales unconditionally to be consistent with en_US.
|
||||
// We want en_US, even though it has no data associated with it.
|
||||
|
||||
// TODO: put any of the languages for which no data exists at the end
|
||||
// of the index. This allows all components based on ICU to use that
|
||||
// as the cutoff point.
|
||||
// if x := data.RawLDML(lang); false ||
|
||||
// x.LocaleDisplayNames != nil ||
|
||||
// x.Characters != nil ||
|
||||
// x.Delimiters != nil ||
|
||||
// x.Measurement != nil ||
|
||||
// x.Dates != nil ||
|
||||
// x.Numbers != nil ||
|
||||
// x.Units != nil ||
|
||||
// x.ListPatterns != nil ||
|
||||
// x.Collations != nil ||
|
||||
// x.Segmentations != nil ||
|
||||
// x.Rbnf != nil ||
|
||||
// x.Annotations != nil ||
|
||||
// x.Metadata != nil {
|
||||
|
||||
// TODO: support POSIX natively, albeit non-standard.
|
||||
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1))
|
||||
m[tag] = true
|
||||
// }
|
||||
}
|
||||
// Include locales for plural rules, which uses a different structure.
|
||||
for _, plurals := range data.Supplemental().Plurals {
|
||||
for _, rules := range plurals.PluralRules {
|
||||
for _, lang := range strings.Split(rules.Locales, " ") {
|
||||
m[language.Make(lang)] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var core, special []language.Tag
|
||||
|
||||
for t := range m {
|
||||
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" {
|
||||
log.Fatalf("Unexpected extension %v in %v", x, t)
|
||||
}
|
||||
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 {
|
||||
core = append(core, t)
|
||||
} else {
|
||||
special = append(special, t)
|
||||
}
|
||||
}
|
||||
|
||||
w.WriteComment(`
|
||||
NumCompactTags is the number of common tags. The maximum tag is
|
||||
NumCompactTags-1.`)
|
||||
w.WriteConst("NumCompactTags", len(core)+len(special))
|
||||
|
||||
sort.Sort(byAlpha(special))
|
||||
w.WriteVar("specialTags", special)
|
||||
|
||||
// TODO: order by frequency?
|
||||
sort.Sort(byAlpha(core))
|
||||
|
||||
// Size computations are just an estimate.
|
||||
w.Size += int(reflect.TypeOf(map[uint32]uint16{}).Size())
|
||||
w.Size += len(core) * 6 // size of uint32 and uint16
|
||||
|
||||
fmt.Fprintln(w)
|
||||
fmt.Fprintln(w, "var coreTags = map[uint32]uint16{")
|
||||
fmt.Fprintln(w, "0x0: 0, // und")
|
||||
i := len(special) + 1 // Und and special tags already written.
|
||||
for _, t := range core {
|
||||
if t == language.Und {
|
||||
continue
|
||||
}
|
||||
fmt.Fprint(w.Hash, t, i)
|
||||
b, s, r := t.Raw()
|
||||
fmt.Fprintf(w, "0x%s%s%s: %d, // %s\n",
|
||||
getIndex(b, 3), // 3 is enough as it is guaranteed to be a compact number
|
||||
getIndex(s, 2),
|
||||
getIndex(r, 3),
|
||||
i, t)
|
||||
i++
|
||||
}
|
||||
fmt.Fprintln(w, "}")
|
||||
}
|
||||
|
||||
// getIndex prints the subtag type and extracts its index of size nibble.
|
||||
// If the index is less than n nibbles, the result is prefixed with 0s.
|
||||
func getIndex(x interface{}, n int) string {
|
||||
s := fmt.Sprintf("%#v", x) // s is of form Type{typeID: 0x00}
|
||||
s = s[strings.Index(s, "0x")+2 : len(s)-1]
|
||||
return strings.Repeat("0", n-len(s)) + s
|
||||
}
|
||||
|
||||
type byAlpha []language.Tag
|
||||
|
||||
func (a byAlpha) Len() int { return len(a) }
|
||||
func (a byAlpha) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a byAlpha) Less(i, j int) bool { return a[i].String() < a[j].String() }
|
||||
+38
@@ -0,0 +1,38 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !go1.2
|
||||
|
||||
package language
|
||||
|
||||
import "sort"
|
||||
|
||||
func sortStable(s sort.Interface) {
|
||||
ss := stableSort{
|
||||
s: s,
|
||||
pos: make([]int, s.Len()),
|
||||
}
|
||||
for i := range ss.pos {
|
||||
ss.pos[i] = i
|
||||
}
|
||||
sort.Sort(&ss)
|
||||
}
|
||||
|
||||
type stableSort struct {
|
||||
s sort.Interface
|
||||
pos []int
|
||||
}
|
||||
|
||||
func (s *stableSort) Len() int {
|
||||
return len(s.pos)
|
||||
}
|
||||
|
||||
func (s *stableSort) Less(i, j int) bool {
|
||||
return s.s.Less(i, j) || !s.s.Less(j, i) && s.pos[i] < s.pos[j]
|
||||
}
|
||||
|
||||
func (s *stableSort) Swap(i, j int) {
|
||||
s.s.Swap(i, j)
|
||||
s.pos[i], s.pos[j] = s.pos[j], s.pos[i]
|
||||
}
|
||||
+11
@@ -0,0 +1,11 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build go1.2
|
||||
|
||||
package language
|
||||
|
||||
import "sort"
|
||||
|
||||
var sortStable = sort.Stable
|
||||
+767
@@ -0,0 +1,767 @@
|
||||
// This file was generated by go generate; DO NOT EDIT
|
||||
|
||||
package language
|
||||
|
||||
// NumCompactTags is the number of common tags. The maximum tag is
|
||||
// NumCompactTags-1.
|
||||
const NumCompactTags = 752
|
||||
|
||||
var specialTags = []Tag{ // 2 elements
|
||||
0: {lang: 0xd5, region: 0x6d, script: 0x0, pVariant: 0x5, pExt: 0xe, str: "ca-ES-valencia"},
|
||||
1: {lang: 0x134, region: 0x134, script: 0x0, pVariant: 0x5, pExt: 0x5, str: "en-US-u-va-posix"},
|
||||
} // Size: 72 bytes
|
||||
|
||||
var coreTags = map[uint32]uint16{
|
||||
0x0: 0, // und
|
||||
0x01500000: 3, // af
|
||||
0x015000d1: 4, // af-NA
|
||||
0x01500160: 5, // af-ZA
|
||||
0x01b00000: 6, // agq
|
||||
0x01b00051: 7, // agq-CM
|
||||
0x02000000: 8, // ak
|
||||
0x0200007f: 9, // ak-GH
|
||||
0x02600000: 10, // am
|
||||
0x0260006e: 11, // am-ET
|
||||
0x03900000: 12, // ar
|
||||
0x03900001: 13, // ar-001
|
||||
0x03900022: 14, // ar-AE
|
||||
0x03900038: 15, // ar-BH
|
||||
0x03900061: 16, // ar-DJ
|
||||
0x03900066: 17, // ar-DZ
|
||||
0x0390006a: 18, // ar-EG
|
||||
0x0390006b: 19, // ar-EH
|
||||
0x0390006c: 20, // ar-ER
|
||||
0x03900096: 21, // ar-IL
|
||||
0x0390009a: 22, // ar-IQ
|
||||
0x039000a0: 23, // ar-JO
|
||||
0x039000a7: 24, // ar-KM
|
||||
0x039000ab: 25, // ar-KW
|
||||
0x039000af: 26, // ar-LB
|
||||
0x039000b8: 27, // ar-LY
|
||||
0x039000b9: 28, // ar-MA
|
||||
0x039000c8: 29, // ar-MR
|
||||
0x039000e0: 30, // ar-OM
|
||||
0x039000ec: 31, // ar-PS
|
||||
0x039000f2: 32, // ar-QA
|
||||
0x03900107: 33, // ar-SA
|
||||
0x0390010a: 34, // ar-SD
|
||||
0x03900114: 35, // ar-SO
|
||||
0x03900116: 36, // ar-SS
|
||||
0x0390011b: 37, // ar-SY
|
||||
0x0390011f: 38, // ar-TD
|
||||
0x03900127: 39, // ar-TN
|
||||
0x0390015d: 40, // ar-YE
|
||||
0x03f00000: 41, // ars
|
||||
0x04200000: 42, // as
|
||||
0x04200098: 43, // as-IN
|
||||
0x04300000: 44, // asa
|
||||
0x0430012e: 45, // asa-TZ
|
||||
0x04700000: 46, // ast
|
||||
0x0470006d: 47, // ast-ES
|
||||
0x05700000: 48, // az
|
||||
0x0571e000: 49, // az-Cyrl
|
||||
0x0571e031: 50, // az-Cyrl-AZ
|
||||
0x05752000: 51, // az-Latn
|
||||
0x05752031: 52, // az-Latn-AZ
|
||||
0x05d00000: 53, // bas
|
||||
0x05d00051: 54, // bas-CM
|
||||
0x07000000: 55, // be
|
||||
0x07000046: 56, // be-BY
|
||||
0x07400000: 57, // bem
|
||||
0x07400161: 58, // bem-ZM
|
||||
0x07800000: 59, // bez
|
||||
0x0780012e: 60, // bez-TZ
|
||||
0x07d00000: 61, // bg
|
||||
0x07d00037: 62, // bg-BG
|
||||
0x08100000: 63, // bh
|
||||
0x09e00000: 64, // bm
|
||||
0x09e000c2: 65, // bm-ML
|
||||
0x0a300000: 66, // bn
|
||||
0x0a300034: 67, // bn-BD
|
||||
0x0a300098: 68, // bn-IN
|
||||
0x0a700000: 69, // bo
|
||||
0x0a700052: 70, // bo-CN
|
||||
0x0a700098: 71, // bo-IN
|
||||
0x0b000000: 72, // br
|
||||
0x0b000077: 73, // br-FR
|
||||
0x0b300000: 74, // brx
|
||||
0x0b300098: 75, // brx-IN
|
||||
0x0b500000: 76, // bs
|
||||
0x0b51e000: 77, // bs-Cyrl
|
||||
0x0b51e032: 78, // bs-Cyrl-BA
|
||||
0x0b552000: 79, // bs-Latn
|
||||
0x0b552032: 80, // bs-Latn-BA
|
||||
0x0d500000: 81, // ca
|
||||
0x0d500021: 82, // ca-AD
|
||||
0x0d50006d: 83, // ca-ES
|
||||
0x0d500077: 84, // ca-FR
|
||||
0x0d50009d: 85, // ca-IT
|
||||
0x0da00000: 86, // ce
|
||||
0x0da00105: 87, // ce-RU
|
||||
0x0dd00000: 88, // cgg
|
||||
0x0dd00130: 89, // cgg-UG
|
||||
0x0e300000: 90, // chr
|
||||
0x0e300134: 91, // chr-US
|
||||
0x0e700000: 92, // ckb
|
||||
0x0e70009a: 93, // ckb-IQ
|
||||
0x0e70009b: 94, // ckb-IR
|
||||
0x0f600000: 95, // cs
|
||||
0x0f60005d: 96, // cs-CZ
|
||||
0x0fa00000: 97, // cu
|
||||
0x0fa00105: 98, // cu-RU
|
||||
0x0fc00000: 99, // cy
|
||||
0x0fc0007a: 100, // cy-GB
|
||||
0x0fd00000: 101, // da
|
||||
0x0fd00062: 102, // da-DK
|
||||
0x0fd00081: 103, // da-GL
|
||||
0x10400000: 104, // dav
|
||||
0x104000a3: 105, // dav-KE
|
||||
0x10900000: 106, // de
|
||||
0x1090002d: 107, // de-AT
|
||||
0x10900035: 108, // de-BE
|
||||
0x1090004d: 109, // de-CH
|
||||
0x1090005f: 110, // de-DE
|
||||
0x1090009d: 111, // de-IT
|
||||
0x109000b1: 112, // de-LI
|
||||
0x109000b6: 113, // de-LU
|
||||
0x11300000: 114, // dje
|
||||
0x113000d3: 115, // dje-NE
|
||||
0x11b00000: 116, // dsb
|
||||
0x11b0005f: 117, // dsb-DE
|
||||
0x12000000: 118, // dua
|
||||
0x12000051: 119, // dua-CM
|
||||
0x12400000: 120, // dv
|
||||
0x12700000: 121, // dyo
|
||||
0x12700113: 122, // dyo-SN
|
||||
0x12900000: 123, // dz
|
||||
0x12900042: 124, // dz-BT
|
||||
0x12b00000: 125, // ebu
|
||||
0x12b000a3: 126, // ebu-KE
|
||||
0x12c00000: 127, // ee
|
||||
0x12c0007f: 128, // ee-GH
|
||||
0x12c00121: 129, // ee-TG
|
||||
0x13100000: 130, // el
|
||||
0x1310005c: 131, // el-CY
|
||||
0x13100086: 132, // el-GR
|
||||
0x13400000: 133, // en
|
||||
0x13400001: 134, // en-001
|
||||
0x1340001a: 135, // en-150
|
||||
0x13400024: 136, // en-AG
|
||||
0x13400025: 137, // en-AI
|
||||
0x1340002c: 138, // en-AS
|
||||
0x1340002d: 139, // en-AT
|
||||
0x1340002e: 140, // en-AU
|
||||
0x13400033: 141, // en-BB
|
||||
0x13400035: 142, // en-BE
|
||||
0x13400039: 143, // en-BI
|
||||
0x1340003c: 144, // en-BM
|
||||
0x13400041: 145, // en-BS
|
||||
0x13400045: 146, // en-BW
|
||||
0x13400047: 147, // en-BZ
|
||||
0x13400048: 148, // en-CA
|
||||
0x13400049: 149, // en-CC
|
||||
0x1340004d: 150, // en-CH
|
||||
0x1340004f: 151, // en-CK
|
||||
0x13400051: 152, // en-CM
|
||||
0x1340005b: 153, // en-CX
|
||||
0x1340005c: 154, // en-CY
|
||||
0x1340005f: 155, // en-DE
|
||||
0x13400060: 156, // en-DG
|
||||
0x13400062: 157, // en-DK
|
||||
0x13400063: 158, // en-DM
|
||||
0x1340006c: 159, // en-ER
|
||||
0x13400071: 160, // en-FI
|
||||
0x13400072: 161, // en-FJ
|
||||
0x13400073: 162, // en-FK
|
||||
0x13400074: 163, // en-FM
|
||||
0x1340007a: 164, // en-GB
|
||||
0x1340007b: 165, // en-GD
|
||||
0x1340007e: 166, // en-GG
|
||||
0x1340007f: 167, // en-GH
|
||||
0x13400080: 168, // en-GI
|
||||
0x13400082: 169, // en-GM
|
||||
0x13400089: 170, // en-GU
|
||||
0x1340008b: 171, // en-GY
|
||||
0x1340008c: 172, // en-HK
|
||||
0x13400095: 173, // en-IE
|
||||
0x13400096: 174, // en-IL
|
||||
0x13400097: 175, // en-IM
|
||||
0x13400098: 176, // en-IN
|
||||
0x13400099: 177, // en-IO
|
||||
0x1340009e: 178, // en-JE
|
||||
0x1340009f: 179, // en-JM
|
||||
0x134000a3: 180, // en-KE
|
||||
0x134000a6: 181, // en-KI
|
||||
0x134000a8: 182, // en-KN
|
||||
0x134000ac: 183, // en-KY
|
||||
0x134000b0: 184, // en-LC
|
||||
0x134000b3: 185, // en-LR
|
||||
0x134000b4: 186, // en-LS
|
||||
0x134000be: 187, // en-MG
|
||||
0x134000bf: 188, // en-MH
|
||||
0x134000c5: 189, // en-MO
|
||||
0x134000c6: 190, // en-MP
|
||||
0x134000c9: 191, // en-MS
|
||||
0x134000ca: 192, // en-MT
|
||||
0x134000cb: 193, // en-MU
|
||||
0x134000cd: 194, // en-MW
|
||||
0x134000cf: 195, // en-MY
|
||||
0x134000d1: 196, // en-NA
|
||||
0x134000d4: 197, // en-NF
|
||||
0x134000d5: 198, // en-NG
|
||||
0x134000d8: 199, // en-NL
|
||||
0x134000dc: 200, // en-NR
|
||||
0x134000de: 201, // en-NU
|
||||
0x134000df: 202, // en-NZ
|
||||
0x134000e5: 203, // en-PG
|
||||
0x134000e6: 204, // en-PH
|
||||
0x134000e7: 205, // en-PK
|
||||
0x134000ea: 206, // en-PN
|
||||
0x134000eb: 207, // en-PR
|
||||
0x134000ef: 208, // en-PW
|
||||
0x13400106: 209, // en-RW
|
||||
0x13400108: 210, // en-SB
|
||||
0x13400109: 211, // en-SC
|
||||
0x1340010a: 212, // en-SD
|
||||
0x1340010b: 213, // en-SE
|
||||
0x1340010c: 214, // en-SG
|
||||
0x1340010d: 215, // en-SH
|
||||
0x1340010e: 216, // en-SI
|
||||
0x13400111: 217, // en-SL
|
||||
0x13400116: 218, // en-SS
|
||||
0x1340011a: 219, // en-SX
|
||||
0x1340011c: 220, // en-SZ
|
||||
0x1340011e: 221, // en-TC
|
||||
0x13400124: 222, // en-TK
|
||||
0x13400128: 223, // en-TO
|
||||
0x1340012b: 224, // en-TT
|
||||
0x1340012c: 225, // en-TV
|
||||
0x1340012e: 226, // en-TZ
|
||||
0x13400130: 227, // en-UG
|
||||
0x13400132: 228, // en-UM
|
||||
0x13400134: 229, // en-US
|
||||
0x13400138: 230, // en-VC
|
||||
0x1340013b: 231, // en-VG
|
||||
0x1340013c: 232, // en-VI
|
||||
0x1340013e: 233, // en-VU
|
||||
0x13400141: 234, // en-WS
|
||||
0x13400160: 235, // en-ZA
|
||||
0x13400161: 236, // en-ZM
|
||||
0x13400163: 237, // en-ZW
|
||||
0x13700000: 238, // eo
|
||||
0x13700001: 239, // eo-001
|
||||
0x13900000: 240, // es
|
||||
0x1390001e: 241, // es-419
|
||||
0x1390002b: 242, // es-AR
|
||||
0x1390003e: 243, // es-BO
|
||||
0x13900040: 244, // es-BR
|
||||
0x13900050: 245, // es-CL
|
||||
0x13900053: 246, // es-CO
|
||||
0x13900055: 247, // es-CR
|
||||
0x13900058: 248, // es-CU
|
||||
0x13900064: 249, // es-DO
|
||||
0x13900067: 250, // es-EA
|
||||
0x13900068: 251, // es-EC
|
||||
0x1390006d: 252, // es-ES
|
||||
0x13900085: 253, // es-GQ
|
||||
0x13900088: 254, // es-GT
|
||||
0x1390008e: 255, // es-HN
|
||||
0x13900093: 256, // es-IC
|
||||
0x139000ce: 257, // es-MX
|
||||
0x139000d7: 258, // es-NI
|
||||
0x139000e1: 259, // es-PA
|
||||
0x139000e3: 260, // es-PE
|
||||
0x139000e6: 261, // es-PH
|
||||
0x139000eb: 262, // es-PR
|
||||
0x139000f0: 263, // es-PY
|
||||
0x13900119: 264, // es-SV
|
||||
0x13900134: 265, // es-US
|
||||
0x13900135: 266, // es-UY
|
||||
0x1390013a: 267, // es-VE
|
||||
0x13b00000: 268, // et
|
||||
0x13b00069: 269, // et-EE
|
||||
0x14000000: 270, // eu
|
||||
0x1400006d: 271, // eu-ES
|
||||
0x14100000: 272, // ewo
|
||||
0x14100051: 273, // ewo-CM
|
||||
0x14300000: 274, // fa
|
||||
0x14300023: 275, // fa-AF
|
||||
0x1430009b: 276, // fa-IR
|
||||
0x14900000: 277, // ff
|
||||
0x14900051: 278, // ff-CM
|
||||
0x14900083: 279, // ff-GN
|
||||
0x149000c8: 280, // ff-MR
|
||||
0x14900113: 281, // ff-SN
|
||||
0x14c00000: 282, // fi
|
||||
0x14c00071: 283, // fi-FI
|
||||
0x14e00000: 284, // fil
|
||||
0x14e000e6: 285, // fil-PH
|
||||
0x15300000: 286, // fo
|
||||
0x15300062: 287, // fo-DK
|
||||
0x15300075: 288, // fo-FO
|
||||
0x15900000: 289, // fr
|
||||
0x15900035: 290, // fr-BE
|
||||
0x15900036: 291, // fr-BF
|
||||
0x15900039: 292, // fr-BI
|
||||
0x1590003a: 293, // fr-BJ
|
||||
0x1590003b: 294, // fr-BL
|
||||
0x15900048: 295, // fr-CA
|
||||
0x1590004a: 296, // fr-CD
|
||||
0x1590004b: 297, // fr-CF
|
||||
0x1590004c: 298, // fr-CG
|
||||
0x1590004d: 299, // fr-CH
|
||||
0x1590004e: 300, // fr-CI
|
||||
0x15900051: 301, // fr-CM
|
||||
0x15900061: 302, // fr-DJ
|
||||
0x15900066: 303, // fr-DZ
|
||||
0x15900077: 304, // fr-FR
|
||||
0x15900079: 305, // fr-GA
|
||||
0x1590007d: 306, // fr-GF
|
||||
0x15900083: 307, // fr-GN
|
||||
0x15900084: 308, // fr-GP
|
||||
0x15900085: 309, // fr-GQ
|
||||
0x15900090: 310, // fr-HT
|
||||
0x159000a7: 311, // fr-KM
|
||||
0x159000b6: 312, // fr-LU
|
||||
0x159000b9: 313, // fr-MA
|
||||
0x159000ba: 314, // fr-MC
|
||||
0x159000bd: 315, // fr-MF
|
||||
0x159000be: 316, // fr-MG
|
||||
0x159000c2: 317, // fr-ML
|
||||
0x159000c7: 318, // fr-MQ
|
||||
0x159000c8: 319, // fr-MR
|
||||
0x159000cb: 320, // fr-MU
|
||||
0x159000d2: 321, // fr-NC
|
||||
0x159000d3: 322, // fr-NE
|
||||
0x159000e4: 323, // fr-PF
|
||||
0x159000e9: 324, // fr-PM
|
||||
0x15900101: 325, // fr-RE
|
||||
0x15900106: 326, // fr-RW
|
||||
0x15900109: 327, // fr-SC
|
||||
0x15900113: 328, // fr-SN
|
||||
0x1590011b: 329, // fr-SY
|
||||
0x1590011f: 330, // fr-TD
|
||||
0x15900121: 331, // fr-TG
|
||||
0x15900127: 332, // fr-TN
|
||||
0x1590013e: 333, // fr-VU
|
||||
0x1590013f: 334, // fr-WF
|
||||
0x1590015e: 335, // fr-YT
|
||||
0x16400000: 336, // fur
|
||||
0x1640009d: 337, // fur-IT
|
||||
0x16800000: 338, // fy
|
||||
0x168000d8: 339, // fy-NL
|
||||
0x16900000: 340, // ga
|
||||
0x16900095: 341, // ga-IE
|
||||
0x17800000: 342, // gd
|
||||
0x1780007a: 343, // gd-GB
|
||||
0x18a00000: 344, // gl
|
||||
0x18a0006d: 345, // gl-ES
|
||||
0x19c00000: 346, // gsw
|
||||
0x19c0004d: 347, // gsw-CH
|
||||
0x19c00077: 348, // gsw-FR
|
||||
0x19c000b1: 349, // gsw-LI
|
||||
0x19d00000: 350, // gu
|
||||
0x19d00098: 351, // gu-IN
|
||||
0x1a200000: 352, // guw
|
||||
0x1a400000: 353, // guz
|
||||
0x1a4000a3: 354, // guz-KE
|
||||
0x1a500000: 355, // gv
|
||||
0x1a500097: 356, // gv-IM
|
||||
0x1ad00000: 357, // ha
|
||||
0x1ad0007f: 358, // ha-GH
|
||||
0x1ad000d3: 359, // ha-NE
|
||||
0x1ad000d5: 360, // ha-NG
|
||||
0x1b100000: 361, // haw
|
||||
0x1b100134: 362, // haw-US
|
||||
0x1b500000: 363, // he
|
||||
0x1b500096: 364, // he-IL
|
||||
0x1b700000: 365, // hi
|
||||
0x1b700098: 366, // hi-IN
|
||||
0x1ca00000: 367, // hr
|
||||
0x1ca00032: 368, // hr-BA
|
||||
0x1ca0008f: 369, // hr-HR
|
||||
0x1cb00000: 370, // hsb
|
||||
0x1cb0005f: 371, // hsb-DE
|
||||
0x1ce00000: 372, // hu
|
||||
0x1ce00091: 373, // hu-HU
|
||||
0x1d000000: 374, // hy
|
||||
0x1d000027: 375, // hy-AM
|
||||
0x1da00000: 376, // id
|
||||
0x1da00094: 377, // id-ID
|
||||
0x1df00000: 378, // ig
|
||||
0x1df000d5: 379, // ig-NG
|
||||
0x1e200000: 380, // ii
|
||||
0x1e200052: 381, // ii-CN
|
||||
0x1f000000: 382, // is
|
||||
0x1f00009c: 383, // is-IS
|
||||
0x1f100000: 384, // it
|
||||
0x1f10004d: 385, // it-CH
|
||||
0x1f10009d: 386, // it-IT
|
||||
0x1f100112: 387, // it-SM
|
||||
0x1f200000: 388, // iu
|
||||
0x1f800000: 389, // ja
|
||||
0x1f8000a1: 390, // ja-JP
|
||||
0x1fb00000: 391, // jbo
|
||||
0x1ff00000: 392, // jgo
|
||||
0x1ff00051: 393, // jgo-CM
|
||||
0x20200000: 394, // jmc
|
||||
0x2020012e: 395, // jmc-TZ
|
||||
0x20600000: 396, // jv
|
||||
0x20800000: 397, // ka
|
||||
0x2080007c: 398, // ka-GE
|
||||
0x20a00000: 399, // kab
|
||||
0x20a00066: 400, // kab-DZ
|
||||
0x20e00000: 401, // kaj
|
||||
0x20f00000: 402, // kam
|
||||
0x20f000a3: 403, // kam-KE
|
||||
0x21700000: 404, // kcg
|
||||
0x21b00000: 405, // kde
|
||||
0x21b0012e: 406, // kde-TZ
|
||||
0x21f00000: 407, // kea
|
||||
0x21f00059: 408, // kea-CV
|
||||
0x22c00000: 409, // khq
|
||||
0x22c000c2: 410, // khq-ML
|
||||
0x23100000: 411, // ki
|
||||
0x231000a3: 412, // ki-KE
|
||||
0x23a00000: 413, // kk
|
||||
0x23a000ad: 414, // kk-KZ
|
||||
0x23c00000: 415, // kkj
|
||||
0x23c00051: 416, // kkj-CM
|
||||
0x23d00000: 417, // kl
|
||||
0x23d00081: 418, // kl-GL
|
||||
0x23e00000: 419, // kln
|
||||
0x23e000a3: 420, // kln-KE
|
||||
0x24200000: 421, // km
|
||||
0x242000a5: 422, // km-KH
|
||||
0x24900000: 423, // kn
|
||||
0x24900098: 424, // kn-IN
|
||||
0x24b00000: 425, // ko
|
||||
0x24b000a9: 426, // ko-KP
|
||||
0x24b000aa: 427, // ko-KR
|
||||
0x24d00000: 428, // kok
|
||||
0x24d00098: 429, // kok-IN
|
||||
0x26100000: 430, // ks
|
||||
0x26100098: 431, // ks-IN
|
||||
0x26200000: 432, // ksb
|
||||
0x2620012e: 433, // ksb-TZ
|
||||
0x26400000: 434, // ksf
|
||||
0x26400051: 435, // ksf-CM
|
||||
0x26500000: 436, // ksh
|
||||
0x2650005f: 437, // ksh-DE
|
||||
0x26b00000: 438, // ku
|
||||
0x27800000: 439, // kw
|
||||
0x2780007a: 440, // kw-GB
|
||||
0x28100000: 441, // ky
|
||||
0x281000a4: 442, // ky-KG
|
||||
0x28800000: 443, // lag
|
||||
0x2880012e: 444, // lag-TZ
|
||||
0x28c00000: 445, // lb
|
||||
0x28c000b6: 446, // lb-LU
|
||||
0x29a00000: 447, // lg
|
||||
0x29a00130: 448, // lg-UG
|
||||
0x2a600000: 449, // lkt
|
||||
0x2a600134: 450, // lkt-US
|
||||
0x2ac00000: 451, // ln
|
||||
0x2ac00029: 452, // ln-AO
|
||||
0x2ac0004a: 453, // ln-CD
|
||||
0x2ac0004b: 454, // ln-CF
|
||||
0x2ac0004c: 455, // ln-CG
|
||||
0x2af00000: 456, // lo
|
||||
0x2af000ae: 457, // lo-LA
|
||||
0x2b600000: 458, // lrc
|
||||
0x2b60009a: 459, // lrc-IQ
|
||||
0x2b60009b: 460, // lrc-IR
|
||||
0x2b700000: 461, // lt
|
||||
0x2b7000b5: 462, // lt-LT
|
||||
0x2b900000: 463, // lu
|
||||
0x2b90004a: 464, // lu-CD
|
||||
0x2bb00000: 465, // luo
|
||||
0x2bb000a3: 466, // luo-KE
|
||||
0x2bc00000: 467, // luy
|
||||
0x2bc000a3: 468, // luy-KE
|
||||
0x2be00000: 469, // lv
|
||||
0x2be000b7: 470, // lv-LV
|
||||
0x2c800000: 471, // mas
|
||||
0x2c8000a3: 472, // mas-KE
|
||||
0x2c80012e: 473, // mas-TZ
|
||||
0x2e000000: 474, // mer
|
||||
0x2e0000a3: 475, // mer-KE
|
||||
0x2e400000: 476, // mfe
|
||||
0x2e4000cb: 477, // mfe-MU
|
||||
0x2e800000: 478, // mg
|
||||
0x2e8000be: 479, // mg-MG
|
||||
0x2e900000: 480, // mgh
|
||||
0x2e9000d0: 481, // mgh-MZ
|
||||
0x2eb00000: 482, // mgo
|
||||
0x2eb00051: 483, // mgo-CM
|
||||
0x2f600000: 484, // mk
|
||||
0x2f6000c1: 485, // mk-MK
|
||||
0x2fb00000: 486, // ml
|
||||
0x2fb00098: 487, // ml-IN
|
||||
0x30200000: 488, // mn
|
||||
0x302000c4: 489, // mn-MN
|
||||
0x31200000: 490, // mr
|
||||
0x31200098: 491, // mr-IN
|
||||
0x31600000: 492, // ms
|
||||
0x3160003d: 493, // ms-BN
|
||||
0x316000cf: 494, // ms-MY
|
||||
0x3160010c: 495, // ms-SG
|
||||
0x31700000: 496, // mt
|
||||
0x317000ca: 497, // mt-MT
|
||||
0x31c00000: 498, // mua
|
||||
0x31c00051: 499, // mua-CM
|
||||
0x32800000: 500, // my
|
||||
0x328000c3: 501, // my-MM
|
||||
0x33100000: 502, // mzn
|
||||
0x3310009b: 503, // mzn-IR
|
||||
0x33800000: 504, // nah
|
||||
0x33c00000: 505, // naq
|
||||
0x33c000d1: 506, // naq-NA
|
||||
0x33e00000: 507, // nb
|
||||
0x33e000d9: 508, // nb-NO
|
||||
0x33e0010f: 509, // nb-SJ
|
||||
0x34500000: 510, // nd
|
||||
0x34500163: 511, // nd-ZW
|
||||
0x34700000: 512, // nds
|
||||
0x3470005f: 513, // nds-DE
|
||||
0x347000d8: 514, // nds-NL
|
||||
0x34800000: 515, // ne
|
||||
0x34800098: 516, // ne-IN
|
||||
0x348000da: 517, // ne-NP
|
||||
0x35e00000: 518, // nl
|
||||
0x35e0002f: 519, // nl-AW
|
||||
0x35e00035: 520, // nl-BE
|
||||
0x35e0003f: 521, // nl-BQ
|
||||
0x35e0005a: 522, // nl-CW
|
||||
0x35e000d8: 523, // nl-NL
|
||||
0x35e00115: 524, // nl-SR
|
||||
0x35e0011a: 525, // nl-SX
|
||||
0x35f00000: 526, // nmg
|
||||
0x35f00051: 527, // nmg-CM
|
||||
0x36100000: 528, // nn
|
||||
0x361000d9: 529, // nn-NO
|
||||
0x36300000: 530, // nnh
|
||||
0x36300051: 531, // nnh-CM
|
||||
0x36600000: 532, // no
|
||||
0x36c00000: 533, // nqo
|
||||
0x36d00000: 534, // nr
|
||||
0x37100000: 535, // nso
|
||||
0x37700000: 536, // nus
|
||||
0x37700116: 537, // nus-SS
|
||||
0x37e00000: 538, // ny
|
||||
0x38000000: 539, // nyn
|
||||
0x38000130: 540, // nyn-UG
|
||||
0x38700000: 541, // om
|
||||
0x3870006e: 542, // om-ET
|
||||
0x387000a3: 543, // om-KE
|
||||
0x38c00000: 544, // or
|
||||
0x38c00098: 545, // or-IN
|
||||
0x38f00000: 546, // os
|
||||
0x38f0007c: 547, // os-GE
|
||||
0x38f00105: 548, // os-RU
|
||||
0x39400000: 549, // pa
|
||||
0x39405000: 550, // pa-Arab
|
||||
0x394050e7: 551, // pa-Arab-PK
|
||||
0x3942f000: 552, // pa-Guru
|
||||
0x3942f098: 553, // pa-Guru-IN
|
||||
0x39800000: 554, // pap
|
||||
0x3aa00000: 555, // pl
|
||||
0x3aa000e8: 556, // pl-PL
|
||||
0x3b400000: 557, // prg
|
||||
0x3b400001: 558, // prg-001
|
||||
0x3b500000: 559, // ps
|
||||
0x3b500023: 560, // ps-AF
|
||||
0x3b700000: 561, // pt
|
||||
0x3b700029: 562, // pt-AO
|
||||
0x3b700040: 563, // pt-BR
|
||||
0x3b70004d: 564, // pt-CH
|
||||
0x3b700059: 565, // pt-CV
|
||||
0x3b700085: 566, // pt-GQ
|
||||
0x3b70008a: 567, // pt-GW
|
||||
0x3b7000b6: 568, // pt-LU
|
||||
0x3b7000c5: 569, // pt-MO
|
||||
0x3b7000d0: 570, // pt-MZ
|
||||
0x3b7000ed: 571, // pt-PT
|
||||
0x3b700117: 572, // pt-ST
|
||||
0x3b700125: 573, // pt-TL
|
||||
0x3bb00000: 574, // qu
|
||||
0x3bb0003e: 575, // qu-BO
|
||||
0x3bb00068: 576, // qu-EC
|
||||
0x3bb000e3: 577, // qu-PE
|
||||
0x3cb00000: 578, // rm
|
||||
0x3cb0004d: 579, // rm-CH
|
||||
0x3d000000: 580, // rn
|
||||
0x3d000039: 581, // rn-BI
|
||||
0x3d300000: 582, // ro
|
||||
0x3d3000bb: 583, // ro-MD
|
||||
0x3d300103: 584, // ro-RO
|
||||
0x3d500000: 585, // rof
|
||||
0x3d50012e: 586, // rof-TZ
|
||||
0x3d900000: 587, // ru
|
||||
0x3d900046: 588, // ru-BY
|
||||
0x3d9000a4: 589, // ru-KG
|
||||
0x3d9000ad: 590, // ru-KZ
|
||||
0x3d9000bb: 591, // ru-MD
|
||||
0x3d900105: 592, // ru-RU
|
||||
0x3d90012f: 593, // ru-UA
|
||||
0x3dc00000: 594, // rw
|
||||
0x3dc00106: 595, // rw-RW
|
||||
0x3dd00000: 596, // rwk
|
||||
0x3dd0012e: 597, // rwk-TZ
|
||||
0x3e200000: 598, // sah
|
||||
0x3e200105: 599, // sah-RU
|
||||
0x3e300000: 600, // saq
|
||||
0x3e3000a3: 601, // saq-KE
|
||||
0x3e900000: 602, // sbp
|
||||
0x3e90012e: 603, // sbp-TZ
|
||||
0x3f200000: 604, // sdh
|
||||
0x3f300000: 605, // se
|
||||
0x3f300071: 606, // se-FI
|
||||
0x3f3000d9: 607, // se-NO
|
||||
0x3f30010b: 608, // se-SE
|
||||
0x3f500000: 609, // seh
|
||||
0x3f5000d0: 610, // seh-MZ
|
||||
0x3f700000: 611, // ses
|
||||
0x3f7000c2: 612, // ses-ML
|
||||
0x3f800000: 613, // sg
|
||||
0x3f80004b: 614, // sg-CF
|
||||
0x3fe00000: 615, // shi
|
||||
0x3fe52000: 616, // shi-Latn
|
||||
0x3fe520b9: 617, // shi-Latn-MA
|
||||
0x3fed2000: 618, // shi-Tfng
|
||||
0x3fed20b9: 619, // shi-Tfng-MA
|
||||
0x40200000: 620, // si
|
||||
0x402000b2: 621, // si-LK
|
||||
0x40800000: 622, // sk
|
||||
0x40800110: 623, // sk-SK
|
||||
0x40c00000: 624, // sl
|
||||
0x40c0010e: 625, // sl-SI
|
||||
0x41200000: 626, // sma
|
||||
0x41300000: 627, // smi
|
||||
0x41400000: 628, // smj
|
||||
0x41500000: 629, // smn
|
||||
0x41500071: 630, // smn-FI
|
||||
0x41800000: 631, // sms
|
||||
0x41900000: 632, // sn
|
||||
0x41900163: 633, // sn-ZW
|
||||
0x41f00000: 634, // so
|
||||
0x41f00061: 635, // so-DJ
|
||||
0x41f0006e: 636, // so-ET
|
||||
0x41f000a3: 637, // so-KE
|
||||
0x41f00114: 638, // so-SO
|
||||
0x42700000: 639, // sq
|
||||
0x42700026: 640, // sq-AL
|
||||
0x427000c1: 641, // sq-MK
|
||||
0x4270014c: 642, // sq-XK
|
||||
0x42800000: 643, // sr
|
||||
0x4281e000: 644, // sr-Cyrl
|
||||
0x4281e032: 645, // sr-Cyrl-BA
|
||||
0x4281e0bc: 646, // sr-Cyrl-ME
|
||||
0x4281e104: 647, // sr-Cyrl-RS
|
||||
0x4281e14c: 648, // sr-Cyrl-XK
|
||||
0x42852000: 649, // sr-Latn
|
||||
0x42852032: 650, // sr-Latn-BA
|
||||
0x428520bc: 651, // sr-Latn-ME
|
||||
0x42852104: 652, // sr-Latn-RS
|
||||
0x4285214c: 653, // sr-Latn-XK
|
||||
0x42d00000: 654, // ss
|
||||
0x43000000: 655, // ssy
|
||||
0x43100000: 656, // st
|
||||
0x43a00000: 657, // sv
|
||||
0x43a00030: 658, // sv-AX
|
||||
0x43a00071: 659, // sv-FI
|
||||
0x43a0010b: 660, // sv-SE
|
||||
0x43b00000: 661, // sw
|
||||
0x43b0004a: 662, // sw-CD
|
||||
0x43b000a3: 663, // sw-KE
|
||||
0x43b0012e: 664, // sw-TZ
|
||||
0x43b00130: 665, // sw-UG
|
||||
0x44400000: 666, // syr
|
||||
0x44600000: 667, // ta
|
||||
0x44600098: 668, // ta-IN
|
||||
0x446000b2: 669, // ta-LK
|
||||
0x446000cf: 670, // ta-MY
|
||||
0x4460010c: 671, // ta-SG
|
||||
0x45700000: 672, // te
|
||||
0x45700098: 673, // te-IN
|
||||
0x45a00000: 674, // teo
|
||||
0x45a000a3: 675, // teo-KE
|
||||
0x45a00130: 676, // teo-UG
|
||||
0x46100000: 677, // th
|
||||
0x46100122: 678, // th-TH
|
||||
0x46500000: 679, // ti
|
||||
0x4650006c: 680, // ti-ER
|
||||
0x4650006e: 681, // ti-ET
|
||||
0x46700000: 682, // tig
|
||||
0x46c00000: 683, // tk
|
||||
0x46c00126: 684, // tk-TM
|
||||
0x47600000: 685, // tn
|
||||
0x47800000: 686, // to
|
||||
0x47800128: 687, // to-TO
|
||||
0x48000000: 688, // tr
|
||||
0x4800005c: 689, // tr-CY
|
||||
0x4800012a: 690, // tr-TR
|
||||
0x48400000: 691, // ts
|
||||
0x49a00000: 692, // twq
|
||||
0x49a000d3: 693, // twq-NE
|
||||
0x49f00000: 694, // tzm
|
||||
0x49f000b9: 695, // tzm-MA
|
||||
0x4a200000: 696, // ug
|
||||
0x4a200052: 697, // ug-CN
|
||||
0x4a400000: 698, // uk
|
||||
0x4a40012f: 699, // uk-UA
|
||||
0x4aa00000: 700, // ur
|
||||
0x4aa00098: 701, // ur-IN
|
||||
0x4aa000e7: 702, // ur-PK
|
||||
0x4b200000: 703, // uz
|
||||
0x4b205000: 704, // uz-Arab
|
||||
0x4b205023: 705, // uz-Arab-AF
|
||||
0x4b21e000: 706, // uz-Cyrl
|
||||
0x4b21e136: 707, // uz-Cyrl-UZ
|
||||
0x4b252000: 708, // uz-Latn
|
||||
0x4b252136: 709, // uz-Latn-UZ
|
||||
0x4b400000: 710, // vai
|
||||
0x4b452000: 711, // vai-Latn
|
||||
0x4b4520b3: 712, // vai-Latn-LR
|
||||
0x4b4d9000: 713, // vai-Vaii
|
||||
0x4b4d90b3: 714, // vai-Vaii-LR
|
||||
0x4b600000: 715, // ve
|
||||
0x4b900000: 716, // vi
|
||||
0x4b90013d: 717, // vi-VN
|
||||
0x4bf00000: 718, // vo
|
||||
0x4bf00001: 719, // vo-001
|
||||
0x4c200000: 720, // vun
|
||||
0x4c20012e: 721, // vun-TZ
|
||||
0x4c400000: 722, // wa
|
||||
0x4c500000: 723, // wae
|
||||
0x4c50004d: 724, // wae-CH
|
||||
0x4db00000: 725, // wo
|
||||
0x4e800000: 726, // xh
|
||||
0x4f100000: 727, // xog
|
||||
0x4f100130: 728, // xog-UG
|
||||
0x4ff00000: 729, // yav
|
||||
0x4ff00051: 730, // yav-CM
|
||||
0x50800000: 731, // yi
|
||||
0x50800001: 732, // yi-001
|
||||
0x50e00000: 733, // yo
|
||||
0x50e0003a: 734, // yo-BJ
|
||||
0x50e000d5: 735, // yo-NG
|
||||
0x51500000: 736, // yue
|
||||
0x5150008c: 737, // yue-HK
|
||||
0x51e00000: 738, // zgh
|
||||
0x51e000b9: 739, // zgh-MA
|
||||
0x51f00000: 740, // zh
|
||||
0x51f34000: 741, // zh-Hans
|
||||
0x51f34052: 742, // zh-Hans-CN
|
||||
0x51f3408c: 743, // zh-Hans-HK
|
||||
0x51f340c5: 744, // zh-Hans-MO
|
||||
0x51f3410c: 745, // zh-Hans-SG
|
||||
0x51f35000: 746, // zh-Hant
|
||||
0x51f3508c: 747, // zh-Hant-HK
|
||||
0x51f350c5: 748, // zh-Hant-MO
|
||||
0x51f3512d: 749, // zh-Hant-TW
|
||||
0x52400000: 750, // zu
|
||||
0x52400160: 751, // zu-ZA
|
||||
}
|
||||
|
||||
// Total table size 4580 bytes (4KiB); checksum: A7F72A2A
|
||||
+975
@@ -0,0 +1,975 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run maketables.go gen_common.go -output tables.go
|
||||
//go:generate go run gen_index.go
|
||||
|
||||
// Package language implements BCP 47 language tags and related functionality.
|
||||
//
|
||||
// The Tag type, which is used to represent languages, is agnostic to the
|
||||
// meaning of its subtags. Tags are not fully canonicalized to preserve
|
||||
// information that may be valuable in certain contexts. As a consequence, two
|
||||
// different tags may represent identical languages.
|
||||
//
|
||||
// Initializing language- or locale-specific components usually consists of
|
||||
// two steps. The first step is to select a display language based on the
|
||||
// preferred languages of the user and the languages supported by an application.
|
||||
// The second step is to create the language-specific services based on
|
||||
// this selection. Each is discussed in more details below.
|
||||
//
|
||||
// Matching preferred against supported languages
|
||||
//
|
||||
// An application may support various languages. This list is typically limited
|
||||
// by the languages for which there exists translations of the user interface.
|
||||
// Similarly, a user may provide a list of preferred languages which is limited
|
||||
// by the languages understood by this user.
|
||||
// An application should use a Matcher to find the best supported language based
|
||||
// on the user's preferred list.
|
||||
// Matchers are aware of the intricacies of equivalence between languages.
|
||||
// The default Matcher implementation takes into account things such as
|
||||
// deprecated subtags, legacy tags, and mutual intelligibility between scripts
|
||||
// and languages.
|
||||
//
|
||||
// A Matcher for English, Australian English, Danish, and standard Mandarin can
|
||||
// be defined as follows:
|
||||
//
|
||||
// var matcher = language.NewMatcher([]language.Tag{
|
||||
// language.English, // The first language is used as fallback.
|
||||
// language.MustParse("en-AU"),
|
||||
// language.Danish,
|
||||
// language.Chinese,
|
||||
// })
|
||||
//
|
||||
// The following code selects the best match for someone speaking Spanish and
|
||||
// Norwegian:
|
||||
//
|
||||
// preferred := []language.Tag{ language.Spanish, language.Norwegian }
|
||||
// tag, _, _ := matcher.Match(preferred...)
|
||||
//
|
||||
// In this case, the best match is Danish, as Danish is sufficiently a match to
|
||||
// Norwegian to not have to fall back to the default.
|
||||
// See ParseAcceptLanguage on how to handle the Accept-Language HTTP header.
|
||||
//
|
||||
// Selecting language-specific services
|
||||
//
|
||||
// One should always use the Tag returned by the Matcher to create an instance
|
||||
// of any of the language-specific services provided by the text repository.
|
||||
// This prevents the mixing of languages, such as having a different language for
|
||||
// messages and display names, as well as improper casing or sorting order for
|
||||
// the selected language.
|
||||
// Using the returned Tag also allows user-defined settings, such as collation
|
||||
// order or numbering system to be transparently passed as options.
|
||||
//
|
||||
// If you have language-specific data in your application, however, it will in
|
||||
// most cases suffice to use the index returned by the matcher to identify
|
||||
// the user language.
|
||||
// The following loop provides an alternative in case this is not sufficient:
|
||||
//
|
||||
// supported := map[language.Tag]data{
|
||||
// language.English: enData,
|
||||
// language.MustParse("en-AU"): enAUData,
|
||||
// language.Danish: daData,
|
||||
// language.Chinese: zhData,
|
||||
// }
|
||||
// tag, _, _ := matcher.Match(preferred...)
|
||||
// for ; tag != language.Und; tag = tag.Parent() {
|
||||
// if v, ok := supported[tag]; ok {
|
||||
// return v
|
||||
// }
|
||||
// }
|
||||
// return enData // should not reach here
|
||||
//
|
||||
// Repeatedly taking the Parent of the tag returned by Match will eventually
|
||||
// match one of the tags used to initialize the Matcher.
|
||||
//
|
||||
// Canonicalization
|
||||
//
|
||||
// By default, only legacy and deprecated tags are converted into their
|
||||
// canonical equivalent. All other information is preserved. This approach makes
|
||||
// the confidence scores more accurate and allows matchers to distinguish
|
||||
// between variants that are otherwise lost.
|
||||
//
|
||||
// As a consequence, two tags that should be treated as identical according to
|
||||
// BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
|
||||
// Matchers will handle such distinctions, though, and are aware of the
|
||||
// equivalence relations. The CanonType type can be used to alter the
|
||||
// canonicalization form.
|
||||
//
|
||||
// References
|
||||
//
|
||||
// BCP 47 - Tags for Identifying Languages
|
||||
// http://tools.ietf.org/html/bcp47
|
||||
package language // import "golang.org/x/text/language"
|
||||
|
||||
// TODO: Remove above NOTE after:
|
||||
// - verifying that tables are dropped correctly (most notably matcher tables).
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
|
||||
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
|
||||
maxCoreSize = 12
|
||||
|
||||
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
|
||||
// is large enough to hold at least 99% of the BCP 47 tags.
|
||||
max99thPercentileSize = 32
|
||||
|
||||
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
|
||||
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
|
||||
maxSimpleUExtensionSize = 14
|
||||
)
|
||||
|
||||
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
|
||||
// specific language or locale. All language tag values are guaranteed to be
|
||||
// well-formed.
|
||||
type Tag struct {
|
||||
lang langID
|
||||
region regionID
|
||||
script scriptID
|
||||
pVariant byte // offset in str, includes preceding '-'
|
||||
pExt uint16 // offset of first extension, includes preceding '-'
|
||||
|
||||
// str is the string representation of the Tag. It will only be used if the
|
||||
// tag has variants or extensions.
|
||||
str string
|
||||
}
|
||||
|
||||
// Make is a convenience wrapper for Parse that omits the error.
|
||||
// In case of an error, a sensible default is returned.
|
||||
func Make(s string) Tag {
|
||||
return Default.Make(s)
|
||||
}
|
||||
|
||||
// Make is a convenience wrapper for c.Parse that omits the error.
|
||||
// In case of an error, a sensible default is returned.
|
||||
func (c CanonType) Make(s string) Tag {
|
||||
t, _ := c.Parse(s)
|
||||
return t
|
||||
}
|
||||
|
||||
// Raw returns the raw base language, script and region, without making an
|
||||
// attempt to infer their values.
|
||||
func (t Tag) Raw() (b Base, s Script, r Region) {
|
||||
return Base{t.lang}, Script{t.script}, Region{t.region}
|
||||
}
|
||||
|
||||
// equalTags compares language, script and region subtags only.
|
||||
func (t Tag) equalTags(a Tag) bool {
|
||||
return t.lang == a.lang && t.script == a.script && t.region == a.region
|
||||
}
|
||||
|
||||
// IsRoot returns true if t is equal to language "und".
|
||||
func (t Tag) IsRoot() bool {
|
||||
if int(t.pVariant) < len(t.str) {
|
||||
return false
|
||||
}
|
||||
return t.equalTags(und)
|
||||
}
|
||||
|
||||
// private reports whether the Tag consists solely of a private use tag.
|
||||
func (t Tag) private() bool {
|
||||
return t.str != "" && t.pVariant == 0
|
||||
}
|
||||
|
||||
// CanonType can be used to enable or disable various types of canonicalization.
|
||||
type CanonType int
|
||||
|
||||
const (
|
||||
// Replace deprecated base languages with their preferred replacements.
|
||||
DeprecatedBase CanonType = 1 << iota
|
||||
// Replace deprecated scripts with their preferred replacements.
|
||||
DeprecatedScript
|
||||
// Replace deprecated regions with their preferred replacements.
|
||||
DeprecatedRegion
|
||||
// Remove redundant scripts.
|
||||
SuppressScript
|
||||
// Normalize legacy encodings. This includes legacy languages defined in
|
||||
// CLDR as well as bibliographic codes defined in ISO-639.
|
||||
Legacy
|
||||
// Map the dominant language of a macro language group to the macro language
|
||||
// subtag. For example cmn -> zh.
|
||||
Macro
|
||||
// The CLDR flag should be used if full compatibility with CLDR is required.
|
||||
// There are a few cases where language.Tag may differ from CLDR. To follow all
|
||||
// of CLDR's suggestions, use All|CLDR.
|
||||
CLDR
|
||||
|
||||
// Raw can be used to Compose or Parse without Canonicalization.
|
||||
Raw CanonType = 0
|
||||
|
||||
// Replace all deprecated tags with their preferred replacements.
|
||||
Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
|
||||
|
||||
// All canonicalizations recommended by BCP 47.
|
||||
BCP47 = Deprecated | SuppressScript
|
||||
|
||||
// All canonicalizations.
|
||||
All = BCP47 | Legacy | Macro
|
||||
|
||||
// Default is the canonicalization used by Parse, Make and Compose. To
|
||||
// preserve as much information as possible, canonicalizations that remove
|
||||
// potentially valuable information are not included. The Matcher is
|
||||
// designed to recognize similar tags that would be the same if
|
||||
// they were canonicalized using All.
|
||||
Default = Deprecated | Legacy
|
||||
|
||||
canonLang = DeprecatedBase | Legacy | Macro
|
||||
|
||||
// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
|
||||
)
|
||||
|
||||
// canonicalize returns the canonicalized equivalent of the tag and
|
||||
// whether there was any change.
|
||||
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
|
||||
if c == Raw {
|
||||
return t, false
|
||||
}
|
||||
changed := false
|
||||
if c&SuppressScript != 0 {
|
||||
if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
|
||||
t.script = 0
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
if c&canonLang != 0 {
|
||||
for {
|
||||
if l, aliasType := normLang(t.lang); l != t.lang {
|
||||
switch aliasType {
|
||||
case langLegacy:
|
||||
if c&Legacy != 0 {
|
||||
if t.lang == _sh && t.script == 0 {
|
||||
t.script = _Latn
|
||||
}
|
||||
t.lang = l
|
||||
changed = true
|
||||
}
|
||||
case langMacro:
|
||||
if c&Macro != 0 {
|
||||
// We deviate here from CLDR. The mapping "nb" -> "no"
|
||||
// qualifies as a typical Macro language mapping. However,
|
||||
// for legacy reasons, CLDR maps "no", the macro language
|
||||
// code for Norwegian, to the dominant variant "nb". This
|
||||
// change is currently under consideration for CLDR as well.
|
||||
// See http://unicode.org/cldr/trac/ticket/2698 and also
|
||||
// http://unicode.org/cldr/trac/ticket/1790 for some of the
|
||||
// practical implications. TODO: this check could be removed
|
||||
// if CLDR adopts this change.
|
||||
if c&CLDR == 0 || t.lang != _nb {
|
||||
changed = true
|
||||
t.lang = l
|
||||
}
|
||||
}
|
||||
case langDeprecated:
|
||||
if c&DeprecatedBase != 0 {
|
||||
if t.lang == _mo && t.region == 0 {
|
||||
t.region = _MD
|
||||
}
|
||||
t.lang = l
|
||||
changed = true
|
||||
// Other canonicalization types may still apply.
|
||||
continue
|
||||
}
|
||||
}
|
||||
} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
|
||||
t.lang = _nb
|
||||
changed = true
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if c&DeprecatedScript != 0 {
|
||||
if t.script == _Qaai {
|
||||
changed = true
|
||||
t.script = _Zinh
|
||||
}
|
||||
}
|
||||
if c&DeprecatedRegion != 0 {
|
||||
if r := normRegion(t.region); r != 0 {
|
||||
changed = true
|
||||
t.region = r
|
||||
}
|
||||
}
|
||||
return t, changed
|
||||
}
|
||||
|
||||
// Canonicalize returns the canonicalized equivalent of the tag.
|
||||
func (c CanonType) Canonicalize(t Tag) (Tag, error) {
|
||||
t, changed := t.canonicalize(c)
|
||||
if changed {
|
||||
t.remakeString()
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// Confidence indicates the level of certainty for a given return value.
|
||||
// For example, Serbian may be written in Cyrillic or Latin script.
|
||||
// The confidence level indicates whether a value was explicitly specified,
|
||||
// whether it is typically the only possible value, or whether there is
|
||||
// an ambiguity.
|
||||
type Confidence int
|
||||
|
||||
const (
|
||||
No Confidence = iota // full confidence that there was no match
|
||||
Low // most likely value picked out of a set of alternatives
|
||||
High // value is generally assumed to be the correct match
|
||||
Exact // exact match or explicitly specified value
|
||||
)
|
||||
|
||||
var confName = []string{"No", "Low", "High", "Exact"}
|
||||
|
||||
func (c Confidence) String() string {
|
||||
return confName[c]
|
||||
}
|
||||
|
||||
// remakeString is used to update t.str in case lang, script or region changed.
|
||||
// It is assumed that pExt and pVariant still point to the start of the
|
||||
// respective parts.
|
||||
func (t *Tag) remakeString() {
|
||||
if t.str == "" {
|
||||
return
|
||||
}
|
||||
extra := t.str[t.pVariant:]
|
||||
if t.pVariant > 0 {
|
||||
extra = extra[1:]
|
||||
}
|
||||
if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
|
||||
t.str = extra
|
||||
t.pVariant = 0
|
||||
t.pExt = 0
|
||||
return
|
||||
}
|
||||
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
|
||||
b := buf[:t.genCoreBytes(buf[:])]
|
||||
if extra != "" {
|
||||
diff := len(b) - int(t.pVariant)
|
||||
b = append(b, '-')
|
||||
b = append(b, extra...)
|
||||
t.pVariant = uint8(int(t.pVariant) + diff)
|
||||
t.pExt = uint16(int(t.pExt) + diff)
|
||||
} else {
|
||||
t.pVariant = uint8(len(b))
|
||||
t.pExt = uint16(len(b))
|
||||
}
|
||||
t.str = string(b)
|
||||
}
|
||||
|
||||
// genCoreBytes writes a string for the base languages, script and region tags
|
||||
// to the given buffer and returns the number of bytes written. It will never
|
||||
// write more than maxCoreSize bytes.
|
||||
func (t *Tag) genCoreBytes(buf []byte) int {
|
||||
n := t.lang.stringToBuf(buf[:])
|
||||
if t.script != 0 {
|
||||
n += copy(buf[n:], "-")
|
||||
n += copy(buf[n:], t.script.String())
|
||||
}
|
||||
if t.region != 0 {
|
||||
n += copy(buf[n:], "-")
|
||||
n += copy(buf[n:], t.region.String())
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// String returns the canonical string representation of the language tag.
|
||||
func (t Tag) String() string {
|
||||
if t.str != "" {
|
||||
return t.str
|
||||
}
|
||||
if t.script == 0 && t.region == 0 {
|
||||
return t.lang.String()
|
||||
}
|
||||
buf := [maxCoreSize]byte{}
|
||||
return string(buf[:t.genCoreBytes(buf[:])])
|
||||
}
|
||||
|
||||
// Base returns the base language of the language tag. If the base language is
|
||||
// unspecified, an attempt will be made to infer it from the context.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Base() (Base, Confidence) {
|
||||
if t.lang != 0 {
|
||||
return Base{t.lang}, Exact
|
||||
}
|
||||
c := High
|
||||
if t.script == 0 && !(Region{t.region}).IsCountry() {
|
||||
c = Low
|
||||
}
|
||||
if tag, err := addTags(t); err == nil && tag.lang != 0 {
|
||||
return Base{tag.lang}, c
|
||||
}
|
||||
return Base{0}, No
|
||||
}
|
||||
|
||||
// Script infers the script for the language tag. If it was not explicitly given, it will infer
|
||||
// a most likely candidate.
|
||||
// If more than one script is commonly used for a language, the most likely one
|
||||
// is returned with a low confidence indication. For example, it returns (Cyrl, Low)
|
||||
// for Serbian.
|
||||
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
|
||||
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
|
||||
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
|
||||
// See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
|
||||
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
|
||||
// Note that an inferred script is never guaranteed to be the correct one. Latin is
|
||||
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
|
||||
// in the past. Also, the script that is commonly used may change over time.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Script() (Script, Confidence) {
|
||||
if t.script != 0 {
|
||||
return Script{t.script}, Exact
|
||||
}
|
||||
sc, c := scriptID(_Zzzz), No
|
||||
if t.lang < langNoIndexOffset {
|
||||
if scr := scriptID(suppressScript[t.lang]); scr != 0 {
|
||||
// Note: it is not always the case that a language with a suppress
|
||||
// script value is only written in one script (e.g. kk, ms, pa).
|
||||
if t.region == 0 {
|
||||
return Script{scriptID(scr)}, High
|
||||
}
|
||||
sc, c = scr, High
|
||||
}
|
||||
}
|
||||
if tag, err := addTags(t); err == nil {
|
||||
if tag.script != sc {
|
||||
sc, c = tag.script, Low
|
||||
}
|
||||
} else {
|
||||
t, _ = (Deprecated | Macro).Canonicalize(t)
|
||||
if tag, err := addTags(t); err == nil && tag.script != sc {
|
||||
sc, c = tag.script, Low
|
||||
}
|
||||
}
|
||||
return Script{sc}, c
|
||||
}
|
||||
|
||||
// Region returns the region for the language tag. If it was not explicitly given, it will
|
||||
// infer a most likely candidate from the context.
|
||||
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
|
||||
func (t Tag) Region() (Region, Confidence) {
|
||||
if t.region != 0 {
|
||||
return Region{t.region}, Exact
|
||||
}
|
||||
if t, err := addTags(t); err == nil {
|
||||
return Region{t.region}, Low // TODO: differentiate between high and low.
|
||||
}
|
||||
t, _ = (Deprecated | Macro).Canonicalize(t)
|
||||
if tag, err := addTags(t); err == nil {
|
||||
return Region{tag.region}, Low
|
||||
}
|
||||
return Region{_ZZ}, No // TODO: return world instead of undetermined?
|
||||
}
|
||||
|
||||
// Variant returns the variants specified explicitly for this language tag.
|
||||
// or nil if no variant was specified.
|
||||
func (t Tag) Variants() []Variant {
|
||||
v := []Variant{}
|
||||
if int(t.pVariant) < int(t.pExt) {
|
||||
for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
|
||||
x, str = nextToken(str)
|
||||
v = append(v, Variant{x})
|
||||
}
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
|
||||
// specific language are substituted with fields from the parent language.
|
||||
// The parent for a language may change for newer versions of CLDR.
|
||||
func (t Tag) Parent() Tag {
|
||||
if t.str != "" {
|
||||
// Strip the variants and extensions.
|
||||
t, _ = Raw.Compose(t.Raw())
|
||||
if t.region == 0 && t.script != 0 && t.lang != 0 {
|
||||
base, _ := addTags(Tag{lang: t.lang})
|
||||
if base.script == t.script {
|
||||
return Tag{lang: t.lang}
|
||||
}
|
||||
}
|
||||
return t
|
||||
}
|
||||
if t.lang != 0 {
|
||||
if t.region != 0 {
|
||||
maxScript := t.script
|
||||
if maxScript == 0 {
|
||||
max, _ := addTags(t)
|
||||
maxScript = max.script
|
||||
}
|
||||
|
||||
for i := range parents {
|
||||
if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
|
||||
for _, r := range parents[i].fromRegion {
|
||||
if regionID(r) == t.region {
|
||||
return Tag{
|
||||
lang: t.lang,
|
||||
script: scriptID(parents[i].script),
|
||||
region: regionID(parents[i].toRegion),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Strip the script if it is the default one.
|
||||
base, _ := addTags(Tag{lang: t.lang})
|
||||
if base.script != maxScript {
|
||||
return Tag{lang: t.lang, script: maxScript}
|
||||
}
|
||||
return Tag{lang: t.lang}
|
||||
} else if t.script != 0 {
|
||||
// The parent for an base-script pair with a non-default script is
|
||||
// "und" instead of the base language.
|
||||
base, _ := addTags(Tag{lang: t.lang})
|
||||
if base.script != t.script {
|
||||
return und
|
||||
}
|
||||
return Tag{lang: t.lang}
|
||||
}
|
||||
}
|
||||
return und
|
||||
}
|
||||
|
||||
// returns token t and the rest of the string.
|
||||
func nextToken(s string) (t, tail string) {
|
||||
p := strings.Index(s[1:], "-")
|
||||
if p == -1 {
|
||||
return s[1:], ""
|
||||
}
|
||||
p++
|
||||
return s[1:p], s[p:]
|
||||
}
|
||||
|
||||
// Extension is a single BCP 47 extension.
|
||||
type Extension struct {
|
||||
s string
|
||||
}
|
||||
|
||||
// String returns the string representation of the extension, including the
|
||||
// type tag.
|
||||
func (e Extension) String() string {
|
||||
return e.s
|
||||
}
|
||||
|
||||
// ParseExtension parses s as an extension and returns it on success.
|
||||
func ParseExtension(s string) (e Extension, err error) {
|
||||
scan := makeScannerString(s)
|
||||
var end int
|
||||
if n := len(scan.token); n != 1 {
|
||||
return Extension{}, errSyntax
|
||||
}
|
||||
scan.toLower(0, len(scan.b))
|
||||
end = parseExtension(&scan)
|
||||
if end != len(s) {
|
||||
return Extension{}, errSyntax
|
||||
}
|
||||
return Extension{string(scan.b)}, nil
|
||||
}
|
||||
|
||||
// Type returns the one-byte extension type of e. It returns 0 for the zero
|
||||
// exception.
|
||||
func (e Extension) Type() byte {
|
||||
if e.s == "" {
|
||||
return 0
|
||||
}
|
||||
return e.s[0]
|
||||
}
|
||||
|
||||
// Tokens returns the list of tokens of e.
|
||||
func (e Extension) Tokens() []string {
|
||||
return strings.Split(e.s, "-")
|
||||
}
|
||||
|
||||
// Extension returns the extension of type x for tag t. It will return
|
||||
// false for ok if t does not have the requested extension. The returned
|
||||
// extension will be invalid in this case.
|
||||
func (t Tag) Extension(x byte) (ext Extension, ok bool) {
|
||||
for i := int(t.pExt); i < len(t.str)-1; {
|
||||
var ext string
|
||||
i, ext = getExtension(t.str, i)
|
||||
if ext[0] == x {
|
||||
return Extension{ext}, true
|
||||
}
|
||||
}
|
||||
return Extension{}, false
|
||||
}
|
||||
|
||||
// Extensions returns all extensions of t.
|
||||
func (t Tag) Extensions() []Extension {
|
||||
e := []Extension{}
|
||||
for i := int(t.pExt); i < len(t.str)-1; {
|
||||
var ext string
|
||||
i, ext = getExtension(t.str, i)
|
||||
e = append(e, Extension{ext})
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// TypeForKey returns the type associated with the given key, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// TypeForKey will traverse the inheritance chain to get the correct value.
|
||||
func (t Tag) TypeForKey(key string) string {
|
||||
if start, end, _ := t.findTypeForKey(key); end != start {
|
||||
return t.str[start:end]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
var (
|
||||
errPrivateUse = errors.New("cannot set a key on a private use tag")
|
||||
errInvalidArguments = errors.New("invalid key or type")
|
||||
)
|
||||
|
||||
// SetTypeForKey returns a new Tag with the key set to type, where key and type
|
||||
// are of the allowed values defined for the Unicode locale extension ('u') in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// An empty value removes an existing pair with the same key.
|
||||
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
|
||||
if t.private() {
|
||||
return t, errPrivateUse
|
||||
}
|
||||
if len(key) != 2 {
|
||||
return t, errInvalidArguments
|
||||
}
|
||||
|
||||
// Remove the setting if value is "".
|
||||
if value == "" {
|
||||
start, end, _ := t.findTypeForKey(key)
|
||||
if start != end {
|
||||
// Remove key tag and leading '-'.
|
||||
start -= 4
|
||||
|
||||
// Remove a possible empty extension.
|
||||
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
|
||||
start -= 2
|
||||
}
|
||||
if start == int(t.pVariant) && end == len(t.str) {
|
||||
t.str = ""
|
||||
t.pVariant, t.pExt = 0, 0
|
||||
} else {
|
||||
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
if len(value) < 3 || len(value) > 8 {
|
||||
return t, errInvalidArguments
|
||||
}
|
||||
|
||||
var (
|
||||
buf [maxCoreSize + maxSimpleUExtensionSize]byte
|
||||
uStart int // start of the -u extension.
|
||||
)
|
||||
|
||||
// Generate the tag string if needed.
|
||||
if t.str == "" {
|
||||
uStart = t.genCoreBytes(buf[:])
|
||||
buf[uStart] = '-'
|
||||
uStart++
|
||||
}
|
||||
|
||||
// Create new key-type pair and parse it to verify.
|
||||
b := buf[uStart:]
|
||||
copy(b, "u-")
|
||||
copy(b[2:], key)
|
||||
b[4] = '-'
|
||||
b = b[:5+copy(b[5:], value)]
|
||||
scan := makeScanner(b)
|
||||
if parseExtensions(&scan); scan.err != nil {
|
||||
return t, scan.err
|
||||
}
|
||||
|
||||
// Assemble the replacement string.
|
||||
if t.str == "" {
|
||||
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
|
||||
t.str = string(buf[:uStart+len(b)])
|
||||
} else {
|
||||
s := t.str
|
||||
start, end, hasExt := t.findTypeForKey(key)
|
||||
if start == end {
|
||||
if hasExt {
|
||||
b = b[2:]
|
||||
}
|
||||
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
|
||||
} else {
|
||||
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// findKeyAndType returns the start and end position for the type corresponding
|
||||
// to key or the point at which to insert the key-value pair if the type
|
||||
// wasn't found. The hasExt return value reports whether an -u extension was present.
|
||||
// Note: the extensions are typically very small and are likely to contain
|
||||
// only one key-type pair.
|
||||
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
|
||||
p := int(t.pExt)
|
||||
if len(key) != 2 || p == len(t.str) || p == 0 {
|
||||
return p, p, false
|
||||
}
|
||||
s := t.str
|
||||
|
||||
// Find the correct extension.
|
||||
for p++; s[p] != 'u'; p++ {
|
||||
if s[p] > 'u' {
|
||||
p--
|
||||
return p, p, false
|
||||
}
|
||||
if p = nextExtension(s, p); p == len(s) {
|
||||
return len(s), len(s), false
|
||||
}
|
||||
}
|
||||
// Proceed to the hyphen following the extension name.
|
||||
p++
|
||||
|
||||
// curKey is the key currently being processed.
|
||||
curKey := ""
|
||||
|
||||
// Iterate over keys until we get the end of a section.
|
||||
for {
|
||||
// p points to the hyphen preceding the current token.
|
||||
if p3 := p + 3; s[p3] == '-' {
|
||||
// Found a key.
|
||||
// Check whether we just processed the key that was requested.
|
||||
if curKey == key {
|
||||
return start, p, true
|
||||
}
|
||||
// Set to the next key and continue scanning type tokens.
|
||||
curKey = s[p+1 : p3]
|
||||
if curKey > key {
|
||||
return p, p, true
|
||||
}
|
||||
// Start of the type token sequence.
|
||||
start = p + 4
|
||||
// A type is at least 3 characters long.
|
||||
p += 7 // 4 + 3
|
||||
} else {
|
||||
// Attribute or type, which is at least 3 characters long.
|
||||
p += 4
|
||||
}
|
||||
// p points past the third character of a type or attribute.
|
||||
max := p + 5 // maximum length of token plus hyphen.
|
||||
if len(s) < max {
|
||||
max = len(s)
|
||||
}
|
||||
for ; p < max && s[p] != '-'; p++ {
|
||||
}
|
||||
// Bail if we have exhausted all tokens or if the next token starts
|
||||
// a new extension.
|
||||
if p == len(s) || s[p+2] == '-' {
|
||||
if curKey == key {
|
||||
return start, p, true
|
||||
}
|
||||
return p, p, true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
|
||||
// for which data exists in the text repository. The index will change over time
|
||||
// and should not be stored in persistent storage. Extensions, except for the
|
||||
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
|
||||
// compact tag exists, where 0 is the index for the root language (Und).
|
||||
func CompactIndex(t Tag) (index int, ok bool) {
|
||||
// TODO: perhaps give more frequent tags a lower index.
|
||||
// TODO: we could make the indexes stable. This will excluded some
|
||||
// possibilities for optimization, so don't do this quite yet.
|
||||
b, s, r := t.Raw()
|
||||
if len(t.str) > 0 {
|
||||
if strings.HasPrefix(t.str, "x-") {
|
||||
// We have no entries for user-defined tags.
|
||||
return 0, false
|
||||
}
|
||||
if uint16(t.pVariant) != t.pExt {
|
||||
// There are no tags with variants and an u-va type.
|
||||
if t.TypeForKey("va") != "" {
|
||||
return 0, false
|
||||
}
|
||||
t, _ = Raw.Compose(b, s, r, t.Variants())
|
||||
} else if _, ok := t.Extension('u'); ok {
|
||||
// Strip all but the 'va' entry.
|
||||
variant := t.TypeForKey("va")
|
||||
t, _ = Raw.Compose(b, s, r)
|
||||
t, _ = t.SetTypeForKey("va", variant)
|
||||
}
|
||||
if len(t.str) > 0 {
|
||||
// We have some variants.
|
||||
for i, s := range specialTags {
|
||||
if s == t {
|
||||
return i + 1, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
// No variants specified: just compare core components.
|
||||
// The key has the form lllssrrr, where l, s, and r are nibbles for
|
||||
// respectively the langID, scriptID, and regionID.
|
||||
key := uint32(b.langID) << (8 + 12)
|
||||
key |= uint32(s.scriptID) << 12
|
||||
key |= uint32(r.regionID)
|
||||
x, ok := coreTags[key]
|
||||
return int(x), ok
|
||||
}
|
||||
|
||||
// Base is an ISO 639 language code, used for encoding the base language
|
||||
// of a language tag.
|
||||
type Base struct {
|
||||
langID
|
||||
}
|
||||
|
||||
// ParseBase parses a 2- or 3-letter ISO 639 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown language identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseBase(s string) (Base, error) {
|
||||
if n := len(s); n < 2 || 3 < n {
|
||||
return Base{}, errSyntax
|
||||
}
|
||||
var buf [3]byte
|
||||
l, err := getLangID(buf[:copy(buf[:], s)])
|
||||
return Base{l}, err
|
||||
}
|
||||
|
||||
// Script is a 4-letter ISO 15924 code for representing scripts.
|
||||
// It is idiomatically represented in title case.
|
||||
type Script struct {
|
||||
scriptID
|
||||
}
|
||||
|
||||
// ParseScript parses a 4-letter ISO 15924 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown script identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseScript(s string) (Script, error) {
|
||||
if len(s) != 4 {
|
||||
return Script{}, errSyntax
|
||||
}
|
||||
var buf [4]byte
|
||||
sc, err := getScriptID(script, buf[:copy(buf[:], s)])
|
||||
return Script{sc}, err
|
||||
}
|
||||
|
||||
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
|
||||
type Region struct {
|
||||
regionID
|
||||
}
|
||||
|
||||
// EncodeM49 returns the Region for the given UN M.49 code.
|
||||
// It returns an error if r is not a valid code.
|
||||
func EncodeM49(r int) (Region, error) {
|
||||
rid, err := getRegionM49(r)
|
||||
return Region{rid}, err
|
||||
}
|
||||
|
||||
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
|
||||
// It returns a ValueError if s is a well-formed but unknown region identifier
|
||||
// or another error if another error occurred.
|
||||
func ParseRegion(s string) (Region, error) {
|
||||
if n := len(s); n < 2 || 3 < n {
|
||||
return Region{}, errSyntax
|
||||
}
|
||||
var buf [3]byte
|
||||
r, err := getRegionID(buf[:copy(buf[:], s)])
|
||||
return Region{r}, err
|
||||
}
|
||||
|
||||
// IsCountry returns whether this region is a country or autonomous area. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsCountry() bool {
|
||||
if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// IsGroup returns whether this region defines a collection of regions. This
|
||||
// includes non-standard definitions from CLDR.
|
||||
func (r Region) IsGroup() bool {
|
||||
if r.regionID == 0 {
|
||||
return false
|
||||
}
|
||||
return int(regionInclusion[r.regionID]) < len(regionContainment)
|
||||
}
|
||||
|
||||
// Contains returns whether Region c is contained by Region r. It returns true
|
||||
// if c == r.
|
||||
func (r Region) Contains(c Region) bool {
|
||||
return r.regionID.contains(c.regionID)
|
||||
}
|
||||
|
||||
func (r regionID) contains(c regionID) bool {
|
||||
if r == c {
|
||||
return true
|
||||
}
|
||||
g := regionInclusion[r]
|
||||
if g >= nRegionGroups {
|
||||
return false
|
||||
}
|
||||
m := regionContainment[g]
|
||||
|
||||
d := regionInclusion[c]
|
||||
b := regionInclusionBits[d]
|
||||
|
||||
// A contained country may belong to multiple disjoint groups. Matching any
|
||||
// of these indicates containment. If the contained region is a group, it
|
||||
// must strictly be a subset.
|
||||
if d >= nRegionGroups {
|
||||
return b&m != 0
|
||||
}
|
||||
return b&^m == 0
|
||||
}
|
||||
|
||||
var errNoTLD = errors.New("language: region is not a valid ccTLD")
|
||||
|
||||
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
|
||||
// In all other cases it returns either the region itself or an error.
|
||||
//
|
||||
// This method may return an error for a region for which there exists a
|
||||
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
|
||||
// region will already be canonicalized it was obtained from a Tag that was
|
||||
// obtained using any of the default methods.
|
||||
func (r Region) TLD() (Region, error) {
|
||||
// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
|
||||
// difference between ISO 3166-1 and IANA ccTLD.
|
||||
if r.regionID == _GB {
|
||||
r = Region{_UK}
|
||||
}
|
||||
if (r.typ() & ccTLD) == 0 {
|
||||
return Region{}, errNoTLD
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
// Canonicalize returns the region or a possible replacement if the region is
|
||||
// deprecated. It will not return a replacement for deprecated regions that
|
||||
// are split into multiple regions.
|
||||
func (r Region) Canonicalize() Region {
|
||||
if cr := normRegion(r.regionID); cr != 0 {
|
||||
return Region{cr}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// Variant represents a registered variant of a language as defined by BCP 47.
|
||||
type Variant struct {
|
||||
variant string
|
||||
}
|
||||
|
||||
// ParseVariant parses and returns a Variant. An error is returned if s is not
|
||||
// a valid variant.
|
||||
func ParseVariant(s string) (Variant, error) {
|
||||
s = strings.ToLower(s)
|
||||
if _, ok := variantIndex[s]; ok {
|
||||
return Variant{s}, nil
|
||||
}
|
||||
return Variant{}, mkErrInvalid([]byte(s))
|
||||
}
|
||||
|
||||
// String returns the string representation of the variant.
|
||||
func (v Variant) String() string {
|
||||
return v.variant
|
||||
}
|
||||
+396
@@ -0,0 +1,396 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
|
||||
"golang.org/x/text/internal/tag"
|
||||
)
|
||||
|
||||
// findIndex tries to find the given tag in idx and returns a standardized error
|
||||
// if it could not be found.
|
||||
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
|
||||
if !tag.FixCase(form, key) {
|
||||
return 0, errSyntax
|
||||
}
|
||||
i := idx.Index(key)
|
||||
if i == -1 {
|
||||
return 0, mkErrInvalid(key)
|
||||
}
|
||||
return i, nil
|
||||
}
|
||||
|
||||
func searchUint(imap []uint16, key uint16) int {
|
||||
return sort.Search(len(imap), func(i int) bool {
|
||||
return imap[i] >= key
|
||||
})
|
||||
}
|
||||
|
||||
type langID uint16
|
||||
|
||||
// getLangID returns the langID of s if s is a canonical subtag
|
||||
// or langUnknown if s is not a canonical subtag.
|
||||
func getLangID(s []byte) (langID, error) {
|
||||
if len(s) == 2 {
|
||||
return getLangISO2(s)
|
||||
}
|
||||
return getLangISO3(s)
|
||||
}
|
||||
|
||||
// mapLang returns the mapped langID of id according to mapping m.
|
||||
func normLang(id langID) (langID, langAliasType) {
|
||||
k := sort.Search(len(langAliasMap), func(i int) bool {
|
||||
return langAliasMap[i].from >= uint16(id)
|
||||
})
|
||||
if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
|
||||
return langID(langAliasMap[k].to), langAliasTypes[k]
|
||||
}
|
||||
return id, langAliasTypeUnknown
|
||||
}
|
||||
|
||||
// getLangISO2 returns the langID for the given 2-letter ISO language code
|
||||
// or unknownLang if this does not exist.
|
||||
func getLangISO2(s []byte) (langID, error) {
|
||||
if !tag.FixCase("zz", s) {
|
||||
return 0, errSyntax
|
||||
}
|
||||
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
|
||||
return langID(i), nil
|
||||
}
|
||||
return 0, mkErrInvalid(s)
|
||||
}
|
||||
|
||||
const base = 'z' - 'a' + 1
|
||||
|
||||
func strToInt(s []byte) uint {
|
||||
v := uint(0)
|
||||
for i := 0; i < len(s); i++ {
|
||||
v *= base
|
||||
v += uint(s[i] - 'a')
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// converts the given integer to the original ASCII string passed to strToInt.
|
||||
// len(s) must match the number of characters obtained.
|
||||
func intToStr(v uint, s []byte) {
|
||||
for i := len(s) - 1; i >= 0; i-- {
|
||||
s[i] = byte(v%base) + 'a'
|
||||
v /= base
|
||||
}
|
||||
}
|
||||
|
||||
// getLangISO3 returns the langID for the given 3-letter ISO language code
|
||||
// or unknownLang if this does not exist.
|
||||
func getLangISO3(s []byte) (langID, error) {
|
||||
if tag.FixCase("und", s) {
|
||||
// first try to match canonical 3-letter entries
|
||||
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
|
||||
if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
|
||||
// We treat "und" as special and always translate it to "unspecified".
|
||||
// Note that ZZ and Zzzz are private use and are not treated as
|
||||
// unspecified by default.
|
||||
id := langID(i)
|
||||
if id == nonCanonicalUnd {
|
||||
return 0, nil
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
}
|
||||
if i := altLangISO3.Index(s); i != -1 {
|
||||
return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
|
||||
}
|
||||
n := strToInt(s)
|
||||
if langNoIndex[n/8]&(1<<(n%8)) != 0 {
|
||||
return langID(n) + langNoIndexOffset, nil
|
||||
}
|
||||
// Check for non-canonical uses of ISO3.
|
||||
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
|
||||
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
|
||||
return langID(i), nil
|
||||
}
|
||||
}
|
||||
return 0, mkErrInvalid(s)
|
||||
}
|
||||
return 0, errSyntax
|
||||
}
|
||||
|
||||
// stringToBuf writes the string to b and returns the number of bytes
|
||||
// written. cap(b) must be >= 3.
|
||||
func (id langID) stringToBuf(b []byte) int {
|
||||
if id >= langNoIndexOffset {
|
||||
intToStr(uint(id)-langNoIndexOffset, b[:3])
|
||||
return 3
|
||||
} else if id == 0 {
|
||||
return copy(b, "und")
|
||||
}
|
||||
l := lang[id<<2:]
|
||||
if l[3] == 0 {
|
||||
return copy(b, l[:3])
|
||||
}
|
||||
return copy(b, l[:2])
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation of the langID.
|
||||
// Use b as variable name, instead of id, to ensure the variable
|
||||
// used is consistent with that of Base in which this type is embedded.
|
||||
func (b langID) String() string {
|
||||
if b == 0 {
|
||||
return "und"
|
||||
} else if b >= langNoIndexOffset {
|
||||
b -= langNoIndexOffset
|
||||
buf := [3]byte{}
|
||||
intToStr(uint(b), buf[:])
|
||||
return string(buf[:])
|
||||
}
|
||||
l := lang.Elem(int(b))
|
||||
if l[3] == 0 {
|
||||
return l[:3]
|
||||
}
|
||||
return l[:2]
|
||||
}
|
||||
|
||||
// ISO3 returns the ISO 639-3 language code.
|
||||
func (b langID) ISO3() string {
|
||||
if b == 0 || b >= langNoIndexOffset {
|
||||
return b.String()
|
||||
}
|
||||
l := lang.Elem(int(b))
|
||||
if l[3] == 0 {
|
||||
return l[:3]
|
||||
} else if l[2] == 0 {
|
||||
return altLangISO3.Elem(int(l[3]))[:3]
|
||||
}
|
||||
// This allocation will only happen for 3-letter ISO codes
|
||||
// that are non-canonical BCP 47 language identifiers.
|
||||
return l[0:1] + l[2:4]
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this language code is reserved for private use.
|
||||
func (b langID) IsPrivateUse() bool {
|
||||
return langPrivateStart <= b && b <= langPrivateEnd
|
||||
}
|
||||
|
||||
type regionID uint16
|
||||
|
||||
// getRegionID returns the region id for s if s is a valid 2-letter region code
|
||||
// or unknownRegion.
|
||||
func getRegionID(s []byte) (regionID, error) {
|
||||
if len(s) == 3 {
|
||||
if isAlpha(s[0]) {
|
||||
return getRegionISO3(s)
|
||||
}
|
||||
if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
|
||||
return getRegionM49(int(i))
|
||||
}
|
||||
}
|
||||
return getRegionISO2(s)
|
||||
}
|
||||
|
||||
// getRegionISO2 returns the regionID for the given 2-letter ISO country code
|
||||
// or unknownRegion if this does not exist.
|
||||
func getRegionISO2(s []byte) (regionID, error) {
|
||||
i, err := findIndex(regionISO, s, "ZZ")
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return regionID(i) + isoRegionOffset, nil
|
||||
}
|
||||
|
||||
// getRegionISO3 returns the regionID for the given 3-letter ISO country code
|
||||
// or unknownRegion if this does not exist.
|
||||
func getRegionISO3(s []byte) (regionID, error) {
|
||||
if tag.FixCase("ZZZ", s) {
|
||||
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
|
||||
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
|
||||
return regionID(i) + isoRegionOffset, nil
|
||||
}
|
||||
}
|
||||
for i := 0; i < len(altRegionISO3); i += 3 {
|
||||
if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
|
||||
return regionID(altRegionIDs[i/3]), nil
|
||||
}
|
||||
}
|
||||
return 0, mkErrInvalid(s)
|
||||
}
|
||||
return 0, errSyntax
|
||||
}
|
||||
|
||||
func getRegionM49(n int) (regionID, error) {
|
||||
if 0 < n && n <= 999 {
|
||||
const (
|
||||
searchBits = 7
|
||||
regionBits = 9
|
||||
regionMask = 1<<regionBits - 1
|
||||
)
|
||||
idx := n >> searchBits
|
||||
buf := fromM49[m49Index[idx]:m49Index[idx+1]]
|
||||
val := uint16(n) << regionBits // we rely on bits shifting out
|
||||
i := sort.Search(len(buf), func(i int) bool {
|
||||
return buf[i] >= val
|
||||
})
|
||||
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
|
||||
return regionID(r & regionMask), nil
|
||||
}
|
||||
}
|
||||
var e ValueError
|
||||
fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
|
||||
return 0, e
|
||||
}
|
||||
|
||||
// normRegion returns a region if r is deprecated or 0 otherwise.
|
||||
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
|
||||
// TODO: consider mapping split up regions to new most populous one (like CLDR).
|
||||
func normRegion(r regionID) regionID {
|
||||
m := regionOldMap
|
||||
k := sort.Search(len(m), func(i int) bool {
|
||||
return m[i].from >= uint16(r)
|
||||
})
|
||||
if k < len(m) && m[k].from == uint16(r) {
|
||||
return regionID(m[k].to)
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
const (
|
||||
iso3166UserAssigned = 1 << iota
|
||||
ccTLD
|
||||
bcp47Region
|
||||
)
|
||||
|
||||
func (r regionID) typ() byte {
|
||||
return regionTypes[r]
|
||||
}
|
||||
|
||||
// String returns the BCP 47 representation for the region.
|
||||
// It returns "ZZ" for an unspecified region.
|
||||
func (r regionID) String() string {
|
||||
if r < isoRegionOffset {
|
||||
if r == 0 {
|
||||
return "ZZ"
|
||||
}
|
||||
return fmt.Sprintf("%03d", r.M49())
|
||||
}
|
||||
r -= isoRegionOffset
|
||||
return regionISO.Elem(int(r))[:2]
|
||||
}
|
||||
|
||||
// ISO3 returns the 3-letter ISO code of r.
|
||||
// Note that not all regions have a 3-letter ISO code.
|
||||
// In such cases this method returns "ZZZ".
|
||||
func (r regionID) ISO3() string {
|
||||
if r < isoRegionOffset {
|
||||
return "ZZZ"
|
||||
}
|
||||
r -= isoRegionOffset
|
||||
reg := regionISO.Elem(int(r))
|
||||
switch reg[2] {
|
||||
case 0:
|
||||
return altRegionISO3[reg[3]:][:3]
|
||||
case ' ':
|
||||
return "ZZZ"
|
||||
}
|
||||
return reg[0:1] + reg[2:4]
|
||||
}
|
||||
|
||||
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
|
||||
// is not defined for r.
|
||||
func (r regionID) M49() int {
|
||||
return int(m49[r])
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
|
||||
// may include private-use tags that are assigned by CLDR and used in this
|
||||
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
|
||||
func (r regionID) IsPrivateUse() bool {
|
||||
return r.typ()&iso3166UserAssigned != 0
|
||||
}
|
||||
|
||||
type scriptID uint8
|
||||
|
||||
// getScriptID returns the script id for string s. It assumes that s
|
||||
// is of the format [A-Z][a-z]{3}.
|
||||
func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
|
||||
i, err := findIndex(idx, s, "Zzzz")
|
||||
return scriptID(i), err
|
||||
}
|
||||
|
||||
// String returns the script code in title case.
|
||||
// It returns "Zzzz" for an unspecified script.
|
||||
func (s scriptID) String() string {
|
||||
if s == 0 {
|
||||
return "Zzzz"
|
||||
}
|
||||
return script.Elem(int(s))
|
||||
}
|
||||
|
||||
// IsPrivateUse reports whether this script code is reserved for private use.
|
||||
func (s scriptID) IsPrivateUse() bool {
|
||||
return _Qaaa <= s && s <= _Qabx
|
||||
}
|
||||
|
||||
const (
|
||||
maxAltTaglen = len("en-US-POSIX")
|
||||
maxLen = maxAltTaglen
|
||||
)
|
||||
|
||||
var (
|
||||
// grandfatheredMap holds a mapping from legacy and grandfathered tags to
|
||||
// their base language or index to more elaborate tag.
|
||||
grandfatheredMap = map[[maxLen]byte]int16{
|
||||
[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
|
||||
[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
|
||||
[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
|
||||
[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
|
||||
[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
|
||||
[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
|
||||
[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
|
||||
[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
|
||||
[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
|
||||
[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
|
||||
[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
|
||||
[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
|
||||
[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
|
||||
[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
|
||||
[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
|
||||
[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
|
||||
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
|
||||
[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang
|
||||
|
||||
// Grandfathered tags with no modern replacement will be converted as
|
||||
// follows:
|
||||
[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
|
||||
[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
|
||||
[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
|
||||
[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
|
||||
[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
|
||||
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min
|
||||
|
||||
// CLDR-specific tag.
|
||||
[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
|
||||
[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
|
||||
}
|
||||
|
||||
altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
|
||||
|
||||
altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
|
||||
)
|
||||
|
||||
func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
|
||||
if v, ok := grandfatheredMap[s]; ok {
|
||||
if v < 0 {
|
||||
return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
|
||||
}
|
||||
t.lang = langID(v)
|
||||
return t, true
|
||||
}
|
||||
return t, false
|
||||
}
|
||||
+1648
File diff suppressed because it is too large
Load Diff
+841
@@ -0,0 +1,841 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import "errors"
|
||||
|
||||
// Matcher is the interface that wraps the Match method.
|
||||
//
|
||||
// Match returns the best match for any of the given tags, along with
|
||||
// a unique index associated with the returned tag and a confidence
|
||||
// score.
|
||||
type Matcher interface {
|
||||
Match(t ...Tag) (tag Tag, index int, c Confidence)
|
||||
}
|
||||
|
||||
// Comprehends reports the confidence score for a speaker of a given language
|
||||
// to being able to comprehend the written form of an alternative language.
|
||||
func Comprehends(speaker, alternative Tag) Confidence {
|
||||
_, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
|
||||
return c
|
||||
}
|
||||
|
||||
// NewMatcher returns a Matcher that matches an ordered list of preferred tags
|
||||
// against a list of supported tags based on written intelligibility, closeness
|
||||
// of dialect, equivalence of subtags and various other rules. It is initialized
|
||||
// with the list of supported tags. The first element is used as the default
|
||||
// value in case no match is found.
|
||||
//
|
||||
// Its Match method matches the first of the given Tags to reach a certain
|
||||
// confidence threshold. The tags passed to Match should therefore be specified
|
||||
// in order of preference. Extensions are ignored for matching.
|
||||
//
|
||||
// The index returned by the Match method corresponds to the index of the
|
||||
// matched tag in t, but is augmented with the Unicode extension ('u')of the
|
||||
// corresponding preferred tag. This allows user locale options to be passed
|
||||
// transparently.
|
||||
func NewMatcher(t []Tag) Matcher {
|
||||
return newMatcher(t)
|
||||
}
|
||||
|
||||
func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
|
||||
match, w, c := m.getBest(want...)
|
||||
if match == nil {
|
||||
t = m.default_.tag
|
||||
} else {
|
||||
t, index = match.tag, match.index
|
||||
}
|
||||
// Copy options from the user-provided tag into the result tag. This is hard
|
||||
// to do after the fact, so we do it here.
|
||||
// TODO: consider also adding in variants that are compatible with the
|
||||
// matched language.
|
||||
// TODO: Add back region if it is non-ambiguous? Or create another tag to
|
||||
// preserve the region?
|
||||
if u, ok := w.Extension('u'); ok {
|
||||
t, _ = Raw.Compose(t, u)
|
||||
}
|
||||
return t, index, c
|
||||
}
|
||||
|
||||
type scriptRegionFlags uint8
|
||||
|
||||
const (
|
||||
isList = 1 << iota
|
||||
scriptInFrom
|
||||
regionInFrom
|
||||
)
|
||||
|
||||
func (t *Tag) setUndefinedLang(id langID) {
|
||||
if t.lang == 0 {
|
||||
t.lang = id
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tag) setUndefinedScript(id scriptID) {
|
||||
if t.script == 0 {
|
||||
t.script = id
|
||||
}
|
||||
}
|
||||
|
||||
func (t *Tag) setUndefinedRegion(id regionID) {
|
||||
if t.region == 0 || t.region.contains(id) {
|
||||
t.region = id
|
||||
}
|
||||
}
|
||||
|
||||
// ErrMissingLikelyTagsData indicates no information was available
|
||||
// to compute likely values of missing tags.
|
||||
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
|
||||
|
||||
// addLikelySubtags sets subtags to their most likely value, given the locale.
|
||||
// In most cases this means setting fields for unknown values, but in some
|
||||
// cases it may alter a value. It returns a ErrMissingLikelyTagsData error
|
||||
// if the given locale cannot be expanded.
|
||||
func (t Tag) addLikelySubtags() (Tag, error) {
|
||||
id, err := addTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
} else if id.equalTags(t) {
|
||||
return t, nil
|
||||
}
|
||||
id.remakeString()
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// specializeRegion attempts to specialize a group region.
|
||||
func specializeRegion(t *Tag) bool {
|
||||
if i := regionInclusion[t.region]; i < nRegionGroups {
|
||||
x := likelyRegionGroup[i]
|
||||
if langID(x.lang) == t.lang && scriptID(x.script) == t.script {
|
||||
t.region = regionID(x.region)
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func addTags(t Tag) (Tag, error) {
|
||||
// We leave private use identifiers alone.
|
||||
if t.private() {
|
||||
return t, nil
|
||||
}
|
||||
if t.script != 0 && t.region != 0 {
|
||||
if t.lang != 0 {
|
||||
// already fully specified
|
||||
specializeRegion(&t)
|
||||
return t, nil
|
||||
}
|
||||
// Search matches for und-script-region. Note that for these cases
|
||||
// region will never be a group so there is no need to check for this.
|
||||
list := likelyRegion[t.region : t.region+1]
|
||||
if x := list[0]; x.flags&isList != 0 {
|
||||
list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
|
||||
}
|
||||
for _, x := range list {
|
||||
// Deviating from the spec. See match_test.go for details.
|
||||
if scriptID(x.script) == t.script {
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
if t.lang != 0 {
|
||||
// Search matches for lang-script and lang-region, where lang != und.
|
||||
if t.lang < langNoIndexOffset {
|
||||
x := likelyLang[t.lang]
|
||||
if x.flags&isList != 0 {
|
||||
list := likelyLangList[x.region : x.region+uint16(x.script)]
|
||||
if t.script != 0 {
|
||||
for _, x := range list {
|
||||
if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
|
||||
t.setUndefinedRegion(regionID(x.region))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
} else if t.region != 0 {
|
||||
count := 0
|
||||
goodScript := true
|
||||
tt := t
|
||||
for _, x := range list {
|
||||
// We visit all entries for which the script was not
|
||||
// defined, including the ones where the region was not
|
||||
// defined. This allows for proper disambiguation within
|
||||
// regions.
|
||||
if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
|
||||
tt.region = regionID(x.region)
|
||||
tt.setUndefinedScript(scriptID(x.script))
|
||||
goodScript = goodScript && tt.script == scriptID(x.script)
|
||||
count++
|
||||
}
|
||||
}
|
||||
if count == 1 {
|
||||
return tt, nil
|
||||
}
|
||||
// Even if we fail to find a unique Region, we might have
|
||||
// an unambiguous script.
|
||||
if goodScript {
|
||||
t.script = tt.script
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Search matches for und-script.
|
||||
if t.script != 0 {
|
||||
x := likelyScript[t.script]
|
||||
if x.region != 0 {
|
||||
t.setUndefinedRegion(regionID(x.region))
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
// Search matches for und-region. If und-script-region exists, it would
|
||||
// have been found earlier.
|
||||
if t.region != 0 {
|
||||
if i := regionInclusion[t.region]; i < nRegionGroups {
|
||||
x := likelyRegionGroup[i]
|
||||
if x.region != 0 {
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
t.setUndefinedScript(scriptID(x.script))
|
||||
t.region = regionID(x.region)
|
||||
}
|
||||
} else {
|
||||
x := likelyRegion[t.region]
|
||||
if x.flags&isList != 0 {
|
||||
x = likelyRegionList[x.lang]
|
||||
}
|
||||
if x.script != 0 && x.flags != scriptInFrom {
|
||||
t.setUndefinedLang(langID(x.lang))
|
||||
t.setUndefinedScript(scriptID(x.script))
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Search matches for lang.
|
||||
if t.lang < langNoIndexOffset {
|
||||
x := likelyLang[t.lang]
|
||||
if x.flags&isList != 0 {
|
||||
x = likelyLangList[x.region]
|
||||
}
|
||||
if x.region != 0 {
|
||||
t.setUndefinedScript(scriptID(x.script))
|
||||
t.setUndefinedRegion(regionID(x.region))
|
||||
}
|
||||
specializeRegion(&t)
|
||||
if t.lang == 0 {
|
||||
t.lang = _en // default language
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
return t, ErrMissingLikelyTagsData
|
||||
}
|
||||
|
||||
func (t *Tag) setTagsFrom(id Tag) {
|
||||
t.lang = id.lang
|
||||
t.script = id.script
|
||||
t.region = id.region
|
||||
}
|
||||
|
||||
// minimize removes the region or script subtags from t such that
|
||||
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
|
||||
func (t Tag) minimize() (Tag, error) {
|
||||
t, err := minimizeTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
}
|
||||
t.remakeString()
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// minimizeTags mimics the behavior of the ICU 51 C implementation.
|
||||
func minimizeTags(t Tag) (Tag, error) {
|
||||
if t.equalTags(und) {
|
||||
return t, nil
|
||||
}
|
||||
max, err := addTags(t)
|
||||
if err != nil {
|
||||
return t, err
|
||||
}
|
||||
for _, id := range [...]Tag{
|
||||
{lang: t.lang},
|
||||
{lang: t.lang, region: t.region},
|
||||
{lang: t.lang, script: t.script},
|
||||
} {
|
||||
if x, err := addTags(id); err == nil && max.equalTags(x) {
|
||||
t.setTagsFrom(id)
|
||||
break
|
||||
}
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// Tag Matching
|
||||
// CLDR defines an algorithm for finding the best match between two sets of language
|
||||
// tags. The basic algorithm defines how to score a possible match and then find
|
||||
// the match with the best score
|
||||
// (see http://www.unicode.org/reports/tr35/#LanguageMatching).
|
||||
// Using scoring has several disadvantages. The scoring obfuscates the importance of
|
||||
// the various factors considered, making the algorithm harder to understand. Using
|
||||
// scoring also requires the full score to be computed for each pair of tags.
|
||||
//
|
||||
// We will use a different algorithm which aims to have the following properties:
|
||||
// - clarity on the precedence of the various selection factors, and
|
||||
// - improved performance by allowing early termination of a comparison.
|
||||
//
|
||||
// Matching algorithm (overview)
|
||||
// Input:
|
||||
// - supported: a set of supported tags
|
||||
// - default: the default tag to return in case there is no match
|
||||
// - desired: list of desired tags, ordered by preference, starting with
|
||||
// the most-preferred.
|
||||
//
|
||||
// Algorithm:
|
||||
// 1) Set the best match to the lowest confidence level
|
||||
// 2) For each tag in "desired":
|
||||
// a) For each tag in "supported":
|
||||
// 1) compute the match between the two tags.
|
||||
// 2) if the match is better than the previous best match, replace it
|
||||
// with the new match. (see next section)
|
||||
// b) if the current best match is above a certain threshold, return this
|
||||
// match without proceeding to the next tag in "desired". [See Note 1]
|
||||
// 3) If the best match so far is below a certain threshold, return "default".
|
||||
//
|
||||
// Ranking:
|
||||
// We use two phases to determine whether one pair of tags are a better match
|
||||
// than another pair of tags. First, we determine a rough confidence level. If the
|
||||
// levels are different, the one with the highest confidence wins.
|
||||
// Second, if the rough confidence levels are identical, we use a set of tie-breaker
|
||||
// rules.
|
||||
//
|
||||
// The confidence level of matching a pair of tags is determined by finding the
|
||||
// lowest confidence level of any matches of the corresponding subtags (the
|
||||
// result is deemed as good as its weakest link).
|
||||
// We define the following levels:
|
||||
// Exact - An exact match of a subtag, before adding likely subtags.
|
||||
// MaxExact - An exact match of a subtag, after adding likely subtags.
|
||||
// [See Note 2].
|
||||
// High - High level of mutual intelligibility between different subtag
|
||||
// variants.
|
||||
// Low - Low level of mutual intelligibility between different subtag
|
||||
// variants.
|
||||
// No - No mutual intelligibility.
|
||||
//
|
||||
// The following levels can occur for each type of subtag:
|
||||
// Base: Exact, MaxExact, High, Low, No
|
||||
// Script: Exact, MaxExact [see Note 3], Low, No
|
||||
// Region: Exact, MaxExact, High
|
||||
// Variant: Exact, High
|
||||
// Private: Exact, No
|
||||
//
|
||||
// Any result with a confidence level of Low or higher is deemed a possible match.
|
||||
// Once a desired tag matches any of the supported tags with a level of MaxExact
|
||||
// or higher, the next desired tag is not considered (see Step 2.b).
|
||||
// Note that CLDR provides languageMatching data that defines close equivalence
|
||||
// classes for base languages, scripts and regions.
|
||||
//
|
||||
// Tie-breaking
|
||||
// If we get the same confidence level for two matches, we apply a sequence of
|
||||
// tie-breaking rules. The first that succeeds defines the result. The rules are
|
||||
// applied in the following order.
|
||||
// 1) Original language was defined and was identical.
|
||||
// 2) Original region was defined and was identical.
|
||||
// 3) Distance between two maximized regions was the smallest.
|
||||
// 4) Original script was defined and was identical.
|
||||
// 5) Distance from want tag to have tag using the parent relation [see Note 5.]
|
||||
// If there is still no winner after these rules are applied, the first match
|
||||
// found wins.
|
||||
//
|
||||
// Notes:
|
||||
// [1] Note that even if we may not have a perfect match, if a match is above a
|
||||
// certain threshold, it is considered a better match than any other match
|
||||
// to a tag later in the list of preferred language tags.
|
||||
// [2] In practice, as matching of Exact is done in a separate phase from
|
||||
// matching the other levels, we reuse the Exact level to mean MaxExact in
|
||||
// the second phase. As a consequence, we only need the levels defined by
|
||||
// the Confidence type. The MaxExact confidence level is mapped to High in
|
||||
// the public API.
|
||||
// [3] We do not differentiate between maximized script values that were derived
|
||||
// from suppressScript versus most likely tag data. We determined that in
|
||||
// ranking the two, one ranks just after the other. Moreover, the two cannot
|
||||
// occur concurrently. As a consequence, they are identical for practical
|
||||
// purposes.
|
||||
// [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
|
||||
// the MaxExact level to allow iw vs he to still be a closer match than
|
||||
// en-AU vs en-US, for example.
|
||||
// [5] In CLDR a locale inherits fields that are unspecified for this locale
|
||||
// from its parent. Therefore, if a locale is a parent of another locale,
|
||||
// it is a strong measure for closeness, especially when no other tie
|
||||
// breaker rule applies. One could also argue it is inconsistent, for
|
||||
// example, when pt-AO matches pt (which CLDR equates with pt-BR), even
|
||||
// though its parent is pt-PT according to the inheritance rules.
|
||||
//
|
||||
// Implementation Details:
|
||||
// There are several performance considerations worth pointing out. Most notably,
|
||||
// we preprocess as much as possible (within reason) at the time of creation of a
|
||||
// matcher. This includes:
|
||||
// - creating a per-language map, which includes data for the raw base language
|
||||
// and its canonicalized variant (if applicable),
|
||||
// - expanding entries for the equivalence classes defined in CLDR's
|
||||
// languageMatch data.
|
||||
// The per-language map ensures that typically only a very small number of tags
|
||||
// need to be considered. The pre-expansion of canonicalized subtags and
|
||||
// equivalence classes reduces the amount of map lookups that need to be done at
|
||||
// runtime.
|
||||
|
||||
// matcher keeps a set of supported language tags, indexed by language.
|
||||
type matcher struct {
|
||||
default_ *haveTag
|
||||
index map[langID]*matchHeader
|
||||
passSettings bool
|
||||
}
|
||||
|
||||
// matchHeader has the lists of tags for exact matches and matches based on
|
||||
// maximized and canonicalized tags for a given language.
|
||||
type matchHeader struct {
|
||||
exact []*haveTag
|
||||
max []*haveTag
|
||||
}
|
||||
|
||||
// haveTag holds a supported Tag and its maximized script and region. The maximized
|
||||
// or canonicalized language is not stored as it is not needed during matching.
|
||||
type haveTag struct {
|
||||
tag Tag
|
||||
|
||||
// index of this tag in the original list of supported tags.
|
||||
index int
|
||||
|
||||
// conf is the maximum confidence that can result from matching this haveTag.
|
||||
// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
|
||||
conf Confidence
|
||||
|
||||
// Maximized region and script.
|
||||
maxRegion regionID
|
||||
maxScript scriptID
|
||||
|
||||
// altScript may be checked as an alternative match to maxScript. If altScript
|
||||
// matches, the confidence level for this match is Low. Theoretically there
|
||||
// could be multiple alternative scripts. This does not occur in practice.
|
||||
altScript scriptID
|
||||
|
||||
// nextMax is the index of the next haveTag with the same maximized tags.
|
||||
nextMax uint16
|
||||
}
|
||||
|
||||
func makeHaveTag(tag Tag, index int) (haveTag, langID) {
|
||||
max := tag
|
||||
if tag.lang != 0 {
|
||||
max, _ = max.canonicalize(All)
|
||||
max, _ = addTags(max)
|
||||
max.remakeString()
|
||||
}
|
||||
return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
|
||||
}
|
||||
|
||||
// altScript returns an alternative script that may match the given script with
|
||||
// a low confidence. At the moment, the langMatch data allows for at most one
|
||||
// script to map to another and we rely on this to keep the code simple.
|
||||
func altScript(l langID, s scriptID) scriptID {
|
||||
for _, alt := range matchScript {
|
||||
if (alt.lang == 0 || langID(alt.lang) == l) && scriptID(alt.have) == s {
|
||||
return scriptID(alt.want)
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// addIfNew adds a haveTag to the list of tags only if it is a unique tag.
|
||||
// Tags that have the same maximized values are linked by index.
|
||||
func (h *matchHeader) addIfNew(n haveTag, exact bool) {
|
||||
// Don't add new exact matches.
|
||||
for _, v := range h.exact {
|
||||
if v.tag.equalsRest(n.tag) {
|
||||
return
|
||||
}
|
||||
}
|
||||
if exact {
|
||||
h.exact = append(h.exact, &n)
|
||||
}
|
||||
// Allow duplicate maximized tags, but create a linked list to allow quickly
|
||||
// comparing the equivalents and bail out.
|
||||
for i, v := range h.max {
|
||||
if v.maxScript == n.maxScript &&
|
||||
v.maxRegion == n.maxRegion &&
|
||||
v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
|
||||
for h.max[i].nextMax != 0 {
|
||||
i = int(h.max[i].nextMax)
|
||||
}
|
||||
h.max[i].nextMax = uint16(len(h.max))
|
||||
break
|
||||
}
|
||||
}
|
||||
h.max = append(h.max, &n)
|
||||
}
|
||||
|
||||
// header returns the matchHeader for the given language. It creates one if
|
||||
// it doesn't already exist.
|
||||
func (m *matcher) header(l langID) *matchHeader {
|
||||
if h := m.index[l]; h != nil {
|
||||
return h
|
||||
}
|
||||
h := &matchHeader{}
|
||||
m.index[l] = h
|
||||
return h
|
||||
}
|
||||
|
||||
// newMatcher builds an index for the given supported tags and returns it as
|
||||
// a matcher. It also expands the index by considering various equivalence classes
|
||||
// for a given tag.
|
||||
func newMatcher(supported []Tag) *matcher {
|
||||
m := &matcher{
|
||||
index: make(map[langID]*matchHeader),
|
||||
}
|
||||
if len(supported) == 0 {
|
||||
m.default_ = &haveTag{}
|
||||
return m
|
||||
}
|
||||
// Add supported languages to the index. Add exact matches first to give
|
||||
// them precedence.
|
||||
for i, tag := range supported {
|
||||
pair, _ := makeHaveTag(tag, i)
|
||||
m.header(tag.lang).addIfNew(pair, true)
|
||||
}
|
||||
m.default_ = m.header(supported[0].lang).exact[0]
|
||||
for i, tag := range supported {
|
||||
pair, max := makeHaveTag(tag, i)
|
||||
if max != tag.lang {
|
||||
m.header(max).addIfNew(pair, false)
|
||||
}
|
||||
}
|
||||
|
||||
// update is used to add indexes in the map for equivalent languages.
|
||||
// If force is true, the update will also apply to derived entries. To
|
||||
// avoid applying a "transitive closure", use false.
|
||||
update := func(want, have uint16, conf Confidence, force bool) {
|
||||
if hh := m.index[langID(have)]; hh != nil {
|
||||
if !force && len(hh.exact) == 0 {
|
||||
return
|
||||
}
|
||||
hw := m.header(langID(want))
|
||||
for _, ht := range hh.max {
|
||||
v := *ht
|
||||
if conf < v.conf {
|
||||
v.conf = conf
|
||||
}
|
||||
v.nextMax = 0 // this value needs to be recomputed
|
||||
if v.altScript != 0 {
|
||||
v.altScript = altScript(langID(want), v.maxScript)
|
||||
}
|
||||
hw.addIfNew(v, conf == Exact && len(hh.exact) > 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add entries for languages with mutual intelligibility as defined by CLDR's
|
||||
// languageMatch data.
|
||||
for _, ml := range matchLang {
|
||||
update(ml.want, ml.have, Confidence(ml.conf), false)
|
||||
if !ml.oneway {
|
||||
update(ml.have, ml.want, Confidence(ml.conf), false)
|
||||
}
|
||||
}
|
||||
|
||||
// Add entries for possible canonicalizations. This is an optimization to
|
||||
// ensure that only one map lookup needs to be done at runtime per desired tag.
|
||||
// First we match deprecated equivalents. If they are perfect equivalents
|
||||
// (their canonicalization simply substitutes a different language code, but
|
||||
// nothing else), the match confidence is Exact, otherwise it is High.
|
||||
for i, lm := range langAliasMap {
|
||||
if lm.from == _sh {
|
||||
continue
|
||||
}
|
||||
|
||||
// If deprecated codes match and there is no fiddling with the script or
|
||||
// or region, we consider it an exact match.
|
||||
conf := Exact
|
||||
if langAliasTypes[i] != langMacro {
|
||||
if !isExactEquivalent(langID(lm.from)) {
|
||||
conf = High
|
||||
}
|
||||
update(lm.to, lm.from, conf, true)
|
||||
}
|
||||
update(lm.from, lm.to, conf, true)
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// getBest gets the best matching tag in m for any of the given tags, taking into
|
||||
// account the order of preference of the given tags.
|
||||
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
|
||||
best := bestMatch{}
|
||||
for _, w := range want {
|
||||
var max Tag
|
||||
// Check for exact match first.
|
||||
h := m.index[w.lang]
|
||||
if w.lang != 0 {
|
||||
// Base language is defined.
|
||||
if h == nil {
|
||||
continue
|
||||
}
|
||||
for i := range h.exact {
|
||||
have := h.exact[i]
|
||||
if have.tag.equalsRest(w) {
|
||||
return have, w, Exact
|
||||
}
|
||||
}
|
||||
max, _ = w.canonicalize(Legacy | Deprecated)
|
||||
max, _ = addTags(max)
|
||||
} else {
|
||||
// Base language is not defined.
|
||||
if h != nil {
|
||||
for i := range h.exact {
|
||||
have := h.exact[i]
|
||||
if have.tag.equalsRest(w) {
|
||||
return have, w, Exact
|
||||
}
|
||||
}
|
||||
}
|
||||
if w.script == 0 && w.region == 0 {
|
||||
// We skip all tags matching und for approximate matching, including
|
||||
// private tags.
|
||||
continue
|
||||
}
|
||||
max, _ = addTags(w)
|
||||
if h = m.index[max.lang]; h == nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
// Check for match based on maximized tag.
|
||||
for i := range h.max {
|
||||
have := h.max[i]
|
||||
best.update(have, w, max.script, max.region)
|
||||
if best.conf == Exact {
|
||||
for have.nextMax != 0 {
|
||||
have = h.max[have.nextMax]
|
||||
best.update(have, w, max.script, max.region)
|
||||
}
|
||||
return best.have, best.want, High
|
||||
}
|
||||
}
|
||||
}
|
||||
if best.conf <= No {
|
||||
if len(want) != 0 {
|
||||
return nil, want[0], No
|
||||
}
|
||||
return nil, Tag{}, No
|
||||
}
|
||||
return best.have, best.want, best.conf
|
||||
}
|
||||
|
||||
// bestMatch accumulates the best match so far.
|
||||
type bestMatch struct {
|
||||
have *haveTag
|
||||
want Tag
|
||||
conf Confidence
|
||||
// Cached results from applying tie-breaking rules.
|
||||
origLang bool
|
||||
origReg bool
|
||||
regDist uint8
|
||||
origScript bool
|
||||
parentDist uint8 // 255 if have is not an ancestor of want tag.
|
||||
}
|
||||
|
||||
// update updates the existing best match if the new pair is considered to be a
|
||||
// better match.
|
||||
// To determine if the given pair is a better match, it first computes the rough
|
||||
// confidence level. If this surpasses the current match, it will replace it and
|
||||
// update the tie-breaker rule cache. If there is a tie, it proceeds with applying
|
||||
// a series of tie-breaker rules. If there is no conclusive winner after applying
|
||||
// the tie-breaker rules, it leaves the current match as the preferred match.
|
||||
func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID) {
|
||||
// Bail if the maximum attainable confidence is below that of the current best match.
|
||||
c := have.conf
|
||||
if c < m.conf {
|
||||
return
|
||||
}
|
||||
if have.maxScript != maxScript {
|
||||
// There is usually very little comprehension between different scripts.
|
||||
// In a few cases there may still be Low comprehension. This possibility is
|
||||
// pre-computed and stored in have.altScript.
|
||||
if Low < m.conf || have.altScript != maxScript {
|
||||
return
|
||||
}
|
||||
c = Low
|
||||
} else if have.maxRegion != maxRegion {
|
||||
// There is usually a small difference between languages across regions.
|
||||
// We use the region distance (below) to disambiguate between equal matches.
|
||||
if High < c {
|
||||
c = High
|
||||
}
|
||||
}
|
||||
|
||||
// We store the results of the computations of the tie-breaker rules along
|
||||
// with the best match. There is no need to do the checks once we determine
|
||||
// we have a winner, but we do still need to do the tie-breaker computations.
|
||||
// We use "beaten" to keep track if we still need to do the checks.
|
||||
beaten := false // true if the new pair defeats the current one.
|
||||
if c != m.conf {
|
||||
if c < m.conf {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Tie-breaker rules:
|
||||
// We prefer if the pre-maximized language was specified and identical.
|
||||
origLang := have.tag.lang == tag.lang && tag.lang != 0
|
||||
if !beaten && m.origLang != origLang {
|
||||
if m.origLang {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// We prefer if the pre-maximized region was specified and identical.
|
||||
origReg := have.tag.region == tag.region && tag.region != 0
|
||||
if !beaten && m.origReg != origReg {
|
||||
if m.origReg {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Next we prefer smaller distances between regions, as defined by regionDist.
|
||||
regDist := regionDist(have.maxRegion, maxRegion, tag.lang)
|
||||
if !beaten && m.regDist != regDist {
|
||||
if regDist > m.regDist {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Next we prefer if the pre-maximized script was specified and identical.
|
||||
origScript := have.tag.script == tag.script && tag.script != 0
|
||||
if !beaten && m.origScript != origScript {
|
||||
if m.origScript {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Finally we prefer tags which have a closer parent relationship.
|
||||
parentDist := parentDistance(have.tag.region, tag)
|
||||
if !beaten && m.parentDist != parentDist {
|
||||
if parentDist > m.parentDist {
|
||||
return
|
||||
}
|
||||
beaten = true
|
||||
}
|
||||
|
||||
// Update m to the newly found best match.
|
||||
if beaten {
|
||||
m.have = have
|
||||
m.want = tag
|
||||
m.conf = c
|
||||
m.origLang = origLang
|
||||
m.origReg = origReg
|
||||
m.origScript = origScript
|
||||
m.regDist = regDist
|
||||
m.parentDist = parentDist
|
||||
}
|
||||
}
|
||||
|
||||
// parentDistance returns the number of times Parent must be called before the
|
||||
// regions match. It is assumed that it has already been checked that lang and
|
||||
// script are identical. If haveRegion does not occur in the ancestor chain of
|
||||
// tag, it returns 255.
|
||||
func parentDistance(haveRegion regionID, tag Tag) uint8 {
|
||||
p := tag.Parent()
|
||||
d := uint8(1)
|
||||
for haveRegion != p.region {
|
||||
if p.region == 0 {
|
||||
return 255
|
||||
}
|
||||
p = p.Parent()
|
||||
d++
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// regionDist wraps regionDistance with some exceptions to the algorithmic distance.
|
||||
func regionDist(a, b regionID, lang langID) uint8 {
|
||||
if lang == _en {
|
||||
// Two variants of non-US English are close to each other, regardless of distance.
|
||||
if a != _US && b != _US {
|
||||
return 2
|
||||
}
|
||||
}
|
||||
return uint8(regionDistance(a, b))
|
||||
}
|
||||
|
||||
// regionDistance computes the distance between two regions based on the
|
||||
// distance in the graph of region containments as defined in CLDR. It iterates
|
||||
// over increasingly inclusive sets of groups, represented as bit vectors, until
|
||||
// the source bit vector has bits in common with the destination vector.
|
||||
func regionDistance(a, b regionID) int {
|
||||
if a == b {
|
||||
return 0
|
||||
}
|
||||
p, q := regionInclusion[a], regionInclusion[b]
|
||||
if p < nRegionGroups {
|
||||
p, q = q, p
|
||||
}
|
||||
set := regionInclusionBits
|
||||
if q < nRegionGroups && set[p]&(1<<q) != 0 {
|
||||
return 1
|
||||
}
|
||||
d := 2
|
||||
for goal := set[q]; set[p]&goal == 0; p = regionInclusionNext[p] {
|
||||
d++
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func (t Tag) variants() string {
|
||||
if t.pVariant == 0 {
|
||||
return ""
|
||||
}
|
||||
return t.str[t.pVariant:t.pExt]
|
||||
}
|
||||
|
||||
// variantOrPrivateTagStr returns variants or private use tags.
|
||||
func (t Tag) variantOrPrivateTagStr() string {
|
||||
if t.pExt > 0 {
|
||||
return t.str[t.pVariant:t.pExt]
|
||||
}
|
||||
return t.str[t.pVariant:]
|
||||
}
|
||||
|
||||
// equalsRest compares everything except the language.
|
||||
func (a Tag) equalsRest(b Tag) bool {
|
||||
// TODO: don't include extensions in this comparison. To do this efficiently,
|
||||
// though, we should handle private tags separately.
|
||||
return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
|
||||
}
|
||||
|
||||
// isExactEquivalent returns true if canonicalizing the language will not alter
|
||||
// the script or region of a tag.
|
||||
func isExactEquivalent(l langID) bool {
|
||||
for _, o := range notEquivalent {
|
||||
if o == l {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
var notEquivalent []langID
|
||||
|
||||
func init() {
|
||||
// Create a list of all languages for which canonicalization may alter the
|
||||
// script or region.
|
||||
for _, lm := range langAliasMap {
|
||||
tag := Tag{lang: langID(lm.from)}
|
||||
if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
|
||||
notEquivalent = append(notEquivalent, langID(lm.from))
|
||||
}
|
||||
}
|
||||
}
|
||||
+859
@@ -0,0 +1,859 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/text/internal/tag"
|
||||
)
|
||||
|
||||
// isAlpha returns true if the byte is not a digit.
|
||||
// b must be an ASCII letter or digit.
|
||||
func isAlpha(b byte) bool {
|
||||
return b > '9'
|
||||
}
|
||||
|
||||
// isAlphaNum returns true if the string contains only ASCII letters or digits.
|
||||
func isAlphaNum(s []byte) bool {
|
||||
for _, c := range s {
|
||||
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// errSyntax is returned by any of the parsing functions when the
|
||||
// input is not well-formed, according to BCP 47.
|
||||
// TODO: return the position at which the syntax error occurred?
|
||||
var errSyntax = errors.New("language: tag is not well-formed")
|
||||
|
||||
// ValueError is returned by any of the parsing functions when the
|
||||
// input is well-formed but the respective subtag is not recognized
|
||||
// as a valid value.
|
||||
type ValueError struct {
|
||||
v [8]byte
|
||||
}
|
||||
|
||||
func mkErrInvalid(s []byte) error {
|
||||
var e ValueError
|
||||
copy(e.v[:], s)
|
||||
return e
|
||||
}
|
||||
|
||||
func (e ValueError) tag() []byte {
|
||||
n := bytes.IndexByte(e.v[:], 0)
|
||||
if n == -1 {
|
||||
n = 8
|
||||
}
|
||||
return e.v[:n]
|
||||
}
|
||||
|
||||
// Error implements the error interface.
|
||||
func (e ValueError) Error() string {
|
||||
return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
|
||||
}
|
||||
|
||||
// Subtag returns the subtag for which the error occurred.
|
||||
func (e ValueError) Subtag() string {
|
||||
return string(e.tag())
|
||||
}
|
||||
|
||||
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
|
||||
type scanner struct {
|
||||
b []byte
|
||||
bytes [max99thPercentileSize]byte
|
||||
token []byte
|
||||
start int // start position of the current token
|
||||
end int // end position of the current token
|
||||
next int // next point for scan
|
||||
err error
|
||||
done bool
|
||||
}
|
||||
|
||||
func makeScannerString(s string) scanner {
|
||||
scan := scanner{}
|
||||
if len(s) <= len(scan.bytes) {
|
||||
scan.b = scan.bytes[:copy(scan.bytes[:], s)]
|
||||
} else {
|
||||
scan.b = []byte(s)
|
||||
}
|
||||
scan.init()
|
||||
return scan
|
||||
}
|
||||
|
||||
// makeScanner returns a scanner using b as the input buffer.
|
||||
// b is not copied and may be modified by the scanner routines.
|
||||
func makeScanner(b []byte) scanner {
|
||||
scan := scanner{b: b}
|
||||
scan.init()
|
||||
return scan
|
||||
}
|
||||
|
||||
func (s *scanner) init() {
|
||||
for i, c := range s.b {
|
||||
if c == '_' {
|
||||
s.b[i] = '-'
|
||||
}
|
||||
}
|
||||
s.scan()
|
||||
}
|
||||
|
||||
// restToLower converts the string between start and end to lower case.
|
||||
func (s *scanner) toLower(start, end int) {
|
||||
for i := start; i < end; i++ {
|
||||
c := s.b[i]
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
s.b[i] += 'a' - 'A'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *scanner) setError(e error) {
|
||||
if s.err == nil || (e == errSyntax && s.err != errSyntax) {
|
||||
s.err = e
|
||||
}
|
||||
}
|
||||
|
||||
// resizeRange shrinks or grows the array at position oldStart such that
|
||||
// a new string of size newSize can fit between oldStart and oldEnd.
|
||||
// Sets the scan point to after the resized range.
|
||||
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
|
||||
s.start = oldStart
|
||||
if end := oldStart + newSize; end != oldEnd {
|
||||
diff := end - oldEnd
|
||||
if end < cap(s.b) {
|
||||
b := make([]byte, len(s.b)+diff)
|
||||
copy(b, s.b[:oldStart])
|
||||
copy(b[end:], s.b[oldEnd:])
|
||||
s.b = b
|
||||
} else {
|
||||
s.b = append(s.b[end:], s.b[oldEnd:]...)
|
||||
}
|
||||
s.next = end + (s.next - s.end)
|
||||
s.end = end
|
||||
}
|
||||
}
|
||||
|
||||
// replace replaces the current token with repl.
|
||||
func (s *scanner) replace(repl string) {
|
||||
s.resizeRange(s.start, s.end, len(repl))
|
||||
copy(s.b[s.start:], repl)
|
||||
}
|
||||
|
||||
// gobble removes the current token from the input.
|
||||
// Caller must call scan after calling gobble.
|
||||
func (s *scanner) gobble(e error) {
|
||||
s.setError(e)
|
||||
if s.start == 0 {
|
||||
s.b = s.b[:+copy(s.b, s.b[s.next:])]
|
||||
s.end = 0
|
||||
} else {
|
||||
s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
|
||||
s.end = s.start - 1
|
||||
}
|
||||
s.next = s.start
|
||||
}
|
||||
|
||||
// deleteRange removes the given range from s.b before the current token.
|
||||
func (s *scanner) deleteRange(start, end int) {
|
||||
s.setError(errSyntax)
|
||||
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
|
||||
diff := end - start
|
||||
s.next -= diff
|
||||
s.start -= diff
|
||||
s.end -= diff
|
||||
}
|
||||
|
||||
// scan parses the next token of a BCP 47 string. Tokens that are larger
|
||||
// than 8 characters or include non-alphanumeric characters result in an error
|
||||
// and are gobbled and removed from the output.
|
||||
// It returns the end position of the last token consumed.
|
||||
func (s *scanner) scan() (end int) {
|
||||
end = s.end
|
||||
s.token = nil
|
||||
for s.start = s.next; s.next < len(s.b); {
|
||||
i := bytes.IndexByte(s.b[s.next:], '-')
|
||||
if i == -1 {
|
||||
s.end = len(s.b)
|
||||
s.next = len(s.b)
|
||||
i = s.end - s.start
|
||||
} else {
|
||||
s.end = s.next + i
|
||||
s.next = s.end + 1
|
||||
}
|
||||
token := s.b[s.start:s.end]
|
||||
if i < 1 || i > 8 || !isAlphaNum(token) {
|
||||
s.gobble(errSyntax)
|
||||
continue
|
||||
}
|
||||
s.token = token
|
||||
return end
|
||||
}
|
||||
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
|
||||
s.setError(errSyntax)
|
||||
s.b = s.b[:len(s.b)-1]
|
||||
}
|
||||
s.done = true
|
||||
return end
|
||||
}
|
||||
|
||||
// acceptMinSize parses multiple tokens of the given size or greater.
|
||||
// It returns the end position of the last token consumed.
|
||||
func (s *scanner) acceptMinSize(min int) (end int) {
|
||||
end = s.end
|
||||
s.scan()
|
||||
for ; len(s.token) >= min; s.scan() {
|
||||
end = s.end
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
|
||||
// failed it returns an error and any part of the tag that could be parsed.
|
||||
// If parsing succeeded but an unknown value was found, it returns
|
||||
// ValueError. The Tag returned in this case is just stripped of the unknown
|
||||
// value. All other values are preserved. It accepts tags in the BCP 47 format
|
||||
// and extensions to this standard defined in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// The resulting tag is canonicalized using the default canonicalization type.
|
||||
func Parse(s string) (t Tag, err error) {
|
||||
return Default.Parse(s)
|
||||
}
|
||||
|
||||
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
|
||||
// failed it returns an error and any part of the tag that could be parsed.
|
||||
// If parsing succeeded but an unknown value was found, it returns
|
||||
// ValueError. The Tag returned in this case is just stripped of the unknown
|
||||
// value. All other values are preserved. It accepts tags in the BCP 47 format
|
||||
// and extensions to this standard defined in
|
||||
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
|
||||
// The resulting tag is canonicalized using the the canonicalization type c.
|
||||
func (c CanonType) Parse(s string) (t Tag, err error) {
|
||||
// TODO: consider supporting old-style locale key-value pairs.
|
||||
if s == "" {
|
||||
return und, errSyntax
|
||||
}
|
||||
if len(s) <= maxAltTaglen {
|
||||
b := [maxAltTaglen]byte{}
|
||||
for i, c := range s {
|
||||
// Generating invalid UTF-8 is okay as it won't match.
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
c += 'a' - 'A'
|
||||
} else if c == '_' {
|
||||
c = '-'
|
||||
}
|
||||
b[i] = byte(c)
|
||||
}
|
||||
if t, ok := grandfathered(b); ok {
|
||||
return t, nil
|
||||
}
|
||||
}
|
||||
scan := makeScannerString(s)
|
||||
t, err = parse(&scan, s)
|
||||
t, changed := t.canonicalize(c)
|
||||
if changed {
|
||||
t.remakeString()
|
||||
}
|
||||
return t, err
|
||||
}
|
||||
|
||||
func parse(scan *scanner, s string) (t Tag, err error) {
|
||||
t = und
|
||||
var end int
|
||||
if n := len(scan.token); n <= 1 {
|
||||
scan.toLower(0, len(scan.b))
|
||||
if n == 0 || scan.token[0] != 'x' {
|
||||
return t, errSyntax
|
||||
}
|
||||
end = parseExtensions(scan)
|
||||
} else if n >= 4 {
|
||||
return und, errSyntax
|
||||
} else { // the usual case
|
||||
t, end = parseTag(scan)
|
||||
if n := len(scan.token); n == 1 {
|
||||
t.pExt = uint16(end)
|
||||
end = parseExtensions(scan)
|
||||
} else if end < len(scan.b) {
|
||||
scan.setError(errSyntax)
|
||||
scan.b = scan.b[:end]
|
||||
}
|
||||
}
|
||||
if int(t.pVariant) < len(scan.b) {
|
||||
if end < len(s) {
|
||||
s = s[:end]
|
||||
}
|
||||
if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
|
||||
t.str = s
|
||||
} else {
|
||||
t.str = string(scan.b)
|
||||
}
|
||||
} else {
|
||||
t.pVariant, t.pExt = 0, 0
|
||||
}
|
||||
return t, scan.err
|
||||
}
|
||||
|
||||
// parseTag parses language, script, region and variants.
|
||||
// It returns a Tag and the end position in the input that was parsed.
|
||||
func parseTag(scan *scanner) (t Tag, end int) {
|
||||
var e error
|
||||
// TODO: set an error if an unknown lang, script or region is encountered.
|
||||
t.lang, e = getLangID(scan.token)
|
||||
scan.setError(e)
|
||||
scan.replace(t.lang.String())
|
||||
langStart := scan.start
|
||||
end = scan.scan()
|
||||
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
|
||||
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
|
||||
// to a tag of the form <extlang>.
|
||||
lang, e := getLangID(scan.token)
|
||||
if lang != 0 {
|
||||
t.lang = lang
|
||||
copy(scan.b[langStart:], lang.String())
|
||||
scan.b[langStart+3] = '-'
|
||||
scan.start = langStart + 4
|
||||
}
|
||||
scan.gobble(e)
|
||||
end = scan.scan()
|
||||
}
|
||||
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
|
||||
t.script, e = getScriptID(script, scan.token)
|
||||
if t.script == 0 {
|
||||
scan.gobble(e)
|
||||
}
|
||||
end = scan.scan()
|
||||
}
|
||||
if n := len(scan.token); n >= 2 && n <= 3 {
|
||||
t.region, e = getRegionID(scan.token)
|
||||
if t.region == 0 {
|
||||
scan.gobble(e)
|
||||
} else {
|
||||
scan.replace(t.region.String())
|
||||
}
|
||||
end = scan.scan()
|
||||
}
|
||||
scan.toLower(scan.start, len(scan.b))
|
||||
t.pVariant = byte(end)
|
||||
end = parseVariants(scan, end, t)
|
||||
t.pExt = uint16(end)
|
||||
return t, end
|
||||
}
|
||||
|
||||
var separator = []byte{'-'}
|
||||
|
||||
// parseVariants scans tokens as long as each token is a valid variant string.
|
||||
// Duplicate variants are removed.
|
||||
func parseVariants(scan *scanner, end int, t Tag) int {
|
||||
start := scan.start
|
||||
varIDBuf := [4]uint8{}
|
||||
variantBuf := [4][]byte{}
|
||||
varID := varIDBuf[:0]
|
||||
variant := variantBuf[:0]
|
||||
last := -1
|
||||
needSort := false
|
||||
for ; len(scan.token) >= 4; scan.scan() {
|
||||
// TODO: measure the impact of needing this conversion and redesign
|
||||
// the data structure if there is an issue.
|
||||
v, ok := variantIndex[string(scan.token)]
|
||||
if !ok {
|
||||
// unknown variant
|
||||
// TODO: allow user-defined variants?
|
||||
scan.gobble(mkErrInvalid(scan.token))
|
||||
continue
|
||||
}
|
||||
varID = append(varID, v)
|
||||
variant = append(variant, scan.token)
|
||||
if !needSort {
|
||||
if last < int(v) {
|
||||
last = int(v)
|
||||
} else {
|
||||
needSort = true
|
||||
// There is no legal combinations of more than 7 variants
|
||||
// (and this is by no means a useful sequence).
|
||||
const maxVariants = 8
|
||||
if len(varID) > maxVariants {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
end = scan.end
|
||||
}
|
||||
if needSort {
|
||||
sort.Sort(variantsSort{varID, variant})
|
||||
k, l := 0, -1
|
||||
for i, v := range varID {
|
||||
w := int(v)
|
||||
if l == w {
|
||||
// Remove duplicates.
|
||||
continue
|
||||
}
|
||||
varID[k] = varID[i]
|
||||
variant[k] = variant[i]
|
||||
k++
|
||||
l = w
|
||||
}
|
||||
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
|
||||
end = start - 1
|
||||
} else {
|
||||
scan.resizeRange(start, end, len(str))
|
||||
copy(scan.b[scan.start:], str)
|
||||
end = scan.end
|
||||
}
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
type variantsSort struct {
|
||||
i []uint8
|
||||
v [][]byte
|
||||
}
|
||||
|
||||
func (s variantsSort) Len() int {
|
||||
return len(s.i)
|
||||
}
|
||||
|
||||
func (s variantsSort) Swap(i, j int) {
|
||||
s.i[i], s.i[j] = s.i[j], s.i[i]
|
||||
s.v[i], s.v[j] = s.v[j], s.v[i]
|
||||
}
|
||||
|
||||
func (s variantsSort) Less(i, j int) bool {
|
||||
return s.i[i] < s.i[j]
|
||||
}
|
||||
|
||||
type bytesSort [][]byte
|
||||
|
||||
func (b bytesSort) Len() int {
|
||||
return len(b)
|
||||
}
|
||||
|
||||
func (b bytesSort) Swap(i, j int) {
|
||||
b[i], b[j] = b[j], b[i]
|
||||
}
|
||||
|
||||
func (b bytesSort) Less(i, j int) bool {
|
||||
return bytes.Compare(b[i], b[j]) == -1
|
||||
}
|
||||
|
||||
// parseExtensions parses and normalizes the extensions in the buffer.
|
||||
// It returns the last position of scan.b that is part of any extension.
|
||||
// It also trims scan.b to remove excess parts accordingly.
|
||||
func parseExtensions(scan *scanner) int {
|
||||
start := scan.start
|
||||
exts := [][]byte{}
|
||||
private := []byte{}
|
||||
end := scan.end
|
||||
for len(scan.token) == 1 {
|
||||
extStart := scan.start
|
||||
ext := scan.token[0]
|
||||
end = parseExtension(scan)
|
||||
extension := scan.b[extStart:end]
|
||||
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
|
||||
scan.setError(errSyntax)
|
||||
end = extStart
|
||||
continue
|
||||
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
|
||||
scan.b = scan.b[:end]
|
||||
return end
|
||||
} else if ext == 'x' {
|
||||
private = extension
|
||||
break
|
||||
}
|
||||
exts = append(exts, extension)
|
||||
}
|
||||
sort.Sort(bytesSort(exts))
|
||||
if len(private) > 0 {
|
||||
exts = append(exts, private)
|
||||
}
|
||||
scan.b = scan.b[:start]
|
||||
if len(exts) > 0 {
|
||||
scan.b = append(scan.b, bytes.Join(exts, separator)...)
|
||||
} else if start > 0 {
|
||||
// Strip trailing '-'.
|
||||
scan.b = scan.b[:start-1]
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// parseExtension parses a single extension and returns the position of
|
||||
// the extension end.
|
||||
func parseExtension(scan *scanner) int {
|
||||
start, end := scan.start, scan.end
|
||||
switch scan.token[0] {
|
||||
case 'u':
|
||||
attrStart := end
|
||||
scan.scan()
|
||||
for last := []byte{}; len(scan.token) > 2; scan.scan() {
|
||||
if bytes.Compare(scan.token, last) != -1 {
|
||||
// Attributes are unsorted. Start over from scratch.
|
||||
p := attrStart + 1
|
||||
scan.next = p
|
||||
attrs := [][]byte{}
|
||||
for scan.scan(); len(scan.token) > 2; scan.scan() {
|
||||
attrs = append(attrs, scan.token)
|
||||
end = scan.end
|
||||
}
|
||||
sort.Sort(bytesSort(attrs))
|
||||
copy(scan.b[p:], bytes.Join(attrs, separator))
|
||||
break
|
||||
}
|
||||
last = scan.token
|
||||
end = scan.end
|
||||
}
|
||||
var last, key []byte
|
||||
for attrEnd := end; len(scan.token) == 2; last = key {
|
||||
key = scan.token
|
||||
keyEnd := scan.end
|
||||
end = scan.acceptMinSize(3)
|
||||
// TODO: check key value validity
|
||||
if keyEnd == end || bytes.Compare(key, last) != 1 {
|
||||
// We have an invalid key or the keys are not sorted.
|
||||
// Start scanning keys from scratch and reorder.
|
||||
p := attrEnd + 1
|
||||
scan.next = p
|
||||
keys := [][]byte{}
|
||||
for scan.scan(); len(scan.token) == 2; {
|
||||
keyStart, keyEnd := scan.start, scan.end
|
||||
end = scan.acceptMinSize(3)
|
||||
if keyEnd != end {
|
||||
keys = append(keys, scan.b[keyStart:end])
|
||||
} else {
|
||||
scan.setError(errSyntax)
|
||||
end = keyStart
|
||||
}
|
||||
}
|
||||
sort.Sort(bytesSort(keys))
|
||||
reordered := bytes.Join(keys, separator)
|
||||
if e := p + len(reordered); e < end {
|
||||
scan.deleteRange(e, end)
|
||||
end = e
|
||||
}
|
||||
copy(scan.b[p:], bytes.Join(keys, separator))
|
||||
break
|
||||
}
|
||||
}
|
||||
case 't':
|
||||
scan.scan()
|
||||
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
|
||||
_, end = parseTag(scan)
|
||||
scan.toLower(start, end)
|
||||
}
|
||||
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
|
||||
end = scan.acceptMinSize(3)
|
||||
}
|
||||
case 'x':
|
||||
end = scan.acceptMinSize(1)
|
||||
default:
|
||||
end = scan.acceptMinSize(2)
|
||||
}
|
||||
return end
|
||||
}
|
||||
|
||||
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
|
||||
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
|
||||
// Base, Script or Region or slice of type Variant or Extension is passed more
|
||||
// than once, the latter will overwrite the former. Variants and Extensions are
|
||||
// accumulated, but if two extensions of the same type are passed, the latter
|
||||
// will replace the former. A Tag overwrites all former values and typically
|
||||
// only makes sense as the first argument. The resulting tag is returned after
|
||||
// canonicalizing using the Default CanonType. If one or more errors are
|
||||
// encountered, one of the errors is returned.
|
||||
func Compose(part ...interface{}) (t Tag, err error) {
|
||||
return Default.Compose(part...)
|
||||
}
|
||||
|
||||
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
|
||||
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
|
||||
// Base, Script or Region or slice of type Variant or Extension is passed more
|
||||
// than once, the latter will overwrite the former. Variants and Extensions are
|
||||
// accumulated, but if two extensions of the same type are passed, the latter
|
||||
// will replace the former. A Tag overwrites all former values and typically
|
||||
// only makes sense as the first argument. The resulting tag is returned after
|
||||
// canonicalizing using CanonType c. If one or more errors are encountered,
|
||||
// one of the errors is returned.
|
||||
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
|
||||
var b builder
|
||||
if err = b.update(part...); err != nil {
|
||||
return und, err
|
||||
}
|
||||
t, _ = b.tag.canonicalize(c)
|
||||
|
||||
if len(b.ext) > 0 || len(b.variant) > 0 {
|
||||
sort.Sort(sortVariant(b.variant))
|
||||
sort.Strings(b.ext)
|
||||
if b.private != "" {
|
||||
b.ext = append(b.ext, b.private)
|
||||
}
|
||||
n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...)
|
||||
buf := make([]byte, n)
|
||||
p := t.genCoreBytes(buf)
|
||||
t.pVariant = byte(p)
|
||||
p += appendTokens(buf[p:], b.variant...)
|
||||
t.pExt = uint16(p)
|
||||
p += appendTokens(buf[p:], b.ext...)
|
||||
t.str = string(buf[:p])
|
||||
} else if b.private != "" {
|
||||
t.str = b.private
|
||||
t.remakeString()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
type builder struct {
|
||||
tag Tag
|
||||
|
||||
private string // the x extension
|
||||
ext []string
|
||||
variant []string
|
||||
|
||||
err error
|
||||
}
|
||||
|
||||
func (b *builder) addExt(e string) {
|
||||
if e == "" {
|
||||
} else if e[0] == 'x' {
|
||||
b.private = e
|
||||
} else {
|
||||
b.ext = append(b.ext, e)
|
||||
}
|
||||
}
|
||||
|
||||
var errInvalidArgument = errors.New("invalid Extension or Variant")
|
||||
|
||||
func (b *builder) update(part ...interface{}) (err error) {
|
||||
replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
|
||||
if s == "" {
|
||||
b.err = errInvalidArgument
|
||||
return true
|
||||
}
|
||||
for i, v := range *l {
|
||||
if eq(v, s) {
|
||||
(*l)[i] = s
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
for _, x := range part {
|
||||
switch v := x.(type) {
|
||||
case Tag:
|
||||
b.tag.lang = v.lang
|
||||
b.tag.region = v.region
|
||||
b.tag.script = v.script
|
||||
if v.str != "" {
|
||||
b.variant = nil
|
||||
for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; {
|
||||
x, s = nextToken(s)
|
||||
b.variant = append(b.variant, x)
|
||||
}
|
||||
b.ext, b.private = nil, ""
|
||||
for i, e := int(v.pExt), ""; i < len(v.str); {
|
||||
i, e = getExtension(v.str, i)
|
||||
b.addExt(e)
|
||||
}
|
||||
}
|
||||
case Base:
|
||||
b.tag.lang = v.langID
|
||||
case Script:
|
||||
b.tag.script = v.scriptID
|
||||
case Region:
|
||||
b.tag.region = v.regionID
|
||||
case Variant:
|
||||
if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) {
|
||||
b.variant = append(b.variant, v.variant)
|
||||
}
|
||||
case Extension:
|
||||
if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
|
||||
b.addExt(v.s)
|
||||
}
|
||||
case []Variant:
|
||||
b.variant = nil
|
||||
for _, x := range v {
|
||||
b.update(x)
|
||||
}
|
||||
case []Extension:
|
||||
b.ext, b.private = nil, ""
|
||||
for _, e := range v {
|
||||
b.update(e)
|
||||
}
|
||||
// TODO: support parsing of raw strings based on morphology or just extensions?
|
||||
case error:
|
||||
err = v
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func tokenLen(token ...string) (n int) {
|
||||
for _, t := range token {
|
||||
n += len(t) + 1
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func appendTokens(b []byte, token ...string) int {
|
||||
p := 0
|
||||
for _, t := range token {
|
||||
b[p] = '-'
|
||||
copy(b[p+1:], t)
|
||||
p += 1 + len(t)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
type sortVariant []string
|
||||
|
||||
func (s sortVariant) Len() int {
|
||||
return len(s)
|
||||
}
|
||||
|
||||
func (s sortVariant) Swap(i, j int) {
|
||||
s[j], s[i] = s[i], s[j]
|
||||
}
|
||||
|
||||
func (s sortVariant) Less(i, j int) bool {
|
||||
return variantIndex[s[i]] < variantIndex[s[j]]
|
||||
}
|
||||
|
||||
func findExt(list []string, x byte) int {
|
||||
for i, e := range list {
|
||||
if e[0] == x {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// getExtension returns the name, body and end position of the extension.
|
||||
func getExtension(s string, p int) (end int, ext string) {
|
||||
if s[p] == '-' {
|
||||
p++
|
||||
}
|
||||
if s[p] == 'x' {
|
||||
return len(s), s[p:]
|
||||
}
|
||||
end = nextExtension(s, p)
|
||||
return end, s[p:end]
|
||||
}
|
||||
|
||||
// nextExtension finds the next extension within the string, searching
|
||||
// for the -<char>- pattern from position p.
|
||||
// In the fast majority of cases, language tags will have at most
|
||||
// one extension and extensions tend to be small.
|
||||
func nextExtension(s string, p int) int {
|
||||
for n := len(s) - 3; p < n; {
|
||||
if s[p] == '-' {
|
||||
if s[p+2] == '-' {
|
||||
return p
|
||||
}
|
||||
p += 3
|
||||
} else {
|
||||
p++
|
||||
}
|
||||
}
|
||||
return len(s)
|
||||
}
|
||||
|
||||
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
|
||||
|
||||
// ParseAcceptLanguage parses the contents of a Accept-Language header as
|
||||
// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
|
||||
// a list of corresponding quality weights. It is more permissive than RFC 2616
|
||||
// and may return non-nil slices even if the input is not valid.
|
||||
// The Tags will be sorted by highest weight first and then by first occurrence.
|
||||
// Tags with a weight of zero will be dropped. An error will be returned if the
|
||||
// input could not be parsed.
|
||||
func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
|
||||
var entry string
|
||||
for s != "" {
|
||||
if entry, s = split(s, ','); entry == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
entry, weight := split(entry, ';')
|
||||
|
||||
// Scan the language.
|
||||
t, err := Parse(entry)
|
||||
if err != nil {
|
||||
id, ok := acceptFallback[entry]
|
||||
if !ok {
|
||||
return nil, nil, err
|
||||
}
|
||||
t = Tag{lang: id}
|
||||
}
|
||||
|
||||
// Scan the optional weight.
|
||||
w := 1.0
|
||||
if weight != "" {
|
||||
weight = consume(weight, 'q')
|
||||
weight = consume(weight, '=')
|
||||
// consume returns the empty string when a token could not be
|
||||
// consumed, resulting in an error for ParseFloat.
|
||||
if w, err = strconv.ParseFloat(weight, 32); err != nil {
|
||||
return nil, nil, errInvalidWeight
|
||||
}
|
||||
// Drop tags with a quality weight of 0.
|
||||
if w <= 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
tag = append(tag, t)
|
||||
q = append(q, float32(w))
|
||||
}
|
||||
sortStable(&tagSort{tag, q})
|
||||
return tag, q, nil
|
||||
}
|
||||
|
||||
// consume removes a leading token c from s and returns the result or the empty
|
||||
// string if there is no such token.
|
||||
func consume(s string, c byte) string {
|
||||
if s == "" || s[0] != c {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(s[1:])
|
||||
}
|
||||
|
||||
func split(s string, c byte) (head, tail string) {
|
||||
if i := strings.IndexByte(s, c); i >= 0 {
|
||||
return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
|
||||
}
|
||||
return strings.TrimSpace(s), ""
|
||||
}
|
||||
|
||||
// Add hack mapping to deal with a small number of cases that that occur
|
||||
// in Accept-Language (with reasonable frequency).
|
||||
var acceptFallback = map[string]langID{
|
||||
"english": _en,
|
||||
"deutsch": _de,
|
||||
"italian": _it,
|
||||
"french": _fr,
|
||||
"*": _mul, // defined in the spec to match all languages.
|
||||
}
|
||||
|
||||
type tagSort struct {
|
||||
tag []Tag
|
||||
q []float32
|
||||
}
|
||||
|
||||
func (s *tagSort) Len() int {
|
||||
return len(s.q)
|
||||
}
|
||||
|
||||
func (s *tagSort) Less(i, j int) bool {
|
||||
return s.q[i] > s.q[j]
|
||||
}
|
||||
|
||||
func (s *tagSort) Swap(i, j int) {
|
||||
s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
|
||||
s.q[i], s.q[j] = s.q[j], s.q[i]
|
||||
}
|
||||
+3547
File diff suppressed because it is too large
Load Diff
+143
@@ -0,0 +1,143 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package language
|
||||
|
||||
// TODO: Various sets of commonly use tags and regions.
|
||||
|
||||
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
|
||||
// It simplifies safe initialization of Tag values.
|
||||
func MustParse(s string) Tag {
|
||||
t, err := Parse(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
|
||||
// It simplifies safe initialization of Tag values.
|
||||
func (c CanonType) MustParse(s string) Tag {
|
||||
t, err := c.Parse(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return t
|
||||
}
|
||||
|
||||
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
|
||||
// It simplifies safe initialization of Base values.
|
||||
func MustParseBase(s string) Base {
|
||||
b, err := ParseBase(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// MustParseScript is like ParseScript, but panics if the given script cannot be
|
||||
// parsed. It simplifies safe initialization of Script values.
|
||||
func MustParseScript(s string) Script {
|
||||
scr, err := ParseScript(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return scr
|
||||
}
|
||||
|
||||
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
|
||||
// parsed. It simplifies safe initialization of Region values.
|
||||
func MustParseRegion(s string) Region {
|
||||
r, err := ParseRegion(s)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
var (
|
||||
und = Tag{}
|
||||
|
||||
Und Tag = Tag{}
|
||||
|
||||
Afrikaans Tag = Tag{lang: _af} // af
|
||||
Amharic Tag = Tag{lang: _am} // am
|
||||
Arabic Tag = Tag{lang: _ar} // ar
|
||||
ModernStandardArabic Tag = Tag{lang: _ar, region: _001} // ar-001
|
||||
Azerbaijani Tag = Tag{lang: _az} // az
|
||||
Bulgarian Tag = Tag{lang: _bg} // bg
|
||||
Bengali Tag = Tag{lang: _bn} // bn
|
||||
Catalan Tag = Tag{lang: _ca} // ca
|
||||
Czech Tag = Tag{lang: _cs} // cs
|
||||
Danish Tag = Tag{lang: _da} // da
|
||||
German Tag = Tag{lang: _de} // de
|
||||
Greek Tag = Tag{lang: _el} // el
|
||||
English Tag = Tag{lang: _en} // en
|
||||
AmericanEnglish Tag = Tag{lang: _en, region: _US} // en-US
|
||||
BritishEnglish Tag = Tag{lang: _en, region: _GB} // en-GB
|
||||
Spanish Tag = Tag{lang: _es} // es
|
||||
EuropeanSpanish Tag = Tag{lang: _es, region: _ES} // es-ES
|
||||
LatinAmericanSpanish Tag = Tag{lang: _es, region: _419} // es-419
|
||||
Estonian Tag = Tag{lang: _et} // et
|
||||
Persian Tag = Tag{lang: _fa} // fa
|
||||
Finnish Tag = Tag{lang: _fi} // fi
|
||||
Filipino Tag = Tag{lang: _fil} // fil
|
||||
French Tag = Tag{lang: _fr} // fr
|
||||
CanadianFrench Tag = Tag{lang: _fr, region: _CA} // fr-CA
|
||||
Gujarati Tag = Tag{lang: _gu} // gu
|
||||
Hebrew Tag = Tag{lang: _he} // he
|
||||
Hindi Tag = Tag{lang: _hi} // hi
|
||||
Croatian Tag = Tag{lang: _hr} // hr
|
||||
Hungarian Tag = Tag{lang: _hu} // hu
|
||||
Armenian Tag = Tag{lang: _hy} // hy
|
||||
Indonesian Tag = Tag{lang: _id} // id
|
||||
Icelandic Tag = Tag{lang: _is} // is
|
||||
Italian Tag = Tag{lang: _it} // it
|
||||
Japanese Tag = Tag{lang: _ja} // ja
|
||||
Georgian Tag = Tag{lang: _ka} // ka
|
||||
Kazakh Tag = Tag{lang: _kk} // kk
|
||||
Khmer Tag = Tag{lang: _km} // km
|
||||
Kannada Tag = Tag{lang: _kn} // kn
|
||||
Korean Tag = Tag{lang: _ko} // ko
|
||||
Kirghiz Tag = Tag{lang: _ky} // ky
|
||||
Lao Tag = Tag{lang: _lo} // lo
|
||||
Lithuanian Tag = Tag{lang: _lt} // lt
|
||||
Latvian Tag = Tag{lang: _lv} // lv
|
||||
Macedonian Tag = Tag{lang: _mk} // mk
|
||||
Malayalam Tag = Tag{lang: _ml} // ml
|
||||
Mongolian Tag = Tag{lang: _mn} // mn
|
||||
Marathi Tag = Tag{lang: _mr} // mr
|
||||
Malay Tag = Tag{lang: _ms} // ms
|
||||
Burmese Tag = Tag{lang: _my} // my
|
||||
Nepali Tag = Tag{lang: _ne} // ne
|
||||
Dutch Tag = Tag{lang: _nl} // nl
|
||||
Norwegian Tag = Tag{lang: _no} // no
|
||||
Punjabi Tag = Tag{lang: _pa} // pa
|
||||
Polish Tag = Tag{lang: _pl} // pl
|
||||
Portuguese Tag = Tag{lang: _pt} // pt
|
||||
BrazilianPortuguese Tag = Tag{lang: _pt, region: _BR} // pt-BR
|
||||
EuropeanPortuguese Tag = Tag{lang: _pt, region: _PT} // pt-PT
|
||||
Romanian Tag = Tag{lang: _ro} // ro
|
||||
Russian Tag = Tag{lang: _ru} // ru
|
||||
Sinhala Tag = Tag{lang: _si} // si
|
||||
Slovak Tag = Tag{lang: _sk} // sk
|
||||
Slovenian Tag = Tag{lang: _sl} // sl
|
||||
Albanian Tag = Tag{lang: _sq} // sq
|
||||
Serbian Tag = Tag{lang: _sr} // sr
|
||||
SerbianLatin Tag = Tag{lang: _sr, script: _Latn} // sr-Latn
|
||||
Swedish Tag = Tag{lang: _sv} // sv
|
||||
Swahili Tag = Tag{lang: _sw} // sw
|
||||
Tamil Tag = Tag{lang: _ta} // ta
|
||||
Telugu Tag = Tag{lang: _te} // te
|
||||
Thai Tag = Tag{lang: _th} // th
|
||||
Turkish Tag = Tag{lang: _tr} // tr
|
||||
Ukrainian Tag = Tag{lang: _uk} // uk
|
||||
Urdu Tag = Tag{lang: _ur} // ur
|
||||
Uzbek Tag = Tag{lang: _uz} // uz
|
||||
Vietnamese Tag = Tag{lang: _vi} // vi
|
||||
Chinese Tag = Tag{lang: _zh} // zh
|
||||
SimplifiedChinese Tag = Tag{lang: _zh, script: _Hans} // zh-Hans
|
||||
TraditionalChinese Tag = Tag{lang: _zh, script: _Hant} // zh-Hant
|
||||
Zulu Tag = Tag{lang: _zu} // zu
|
||||
)
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+187
@@ -0,0 +1,187 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package runes
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is.
|
||||
// This is done for various reasons:
|
||||
// - To retain the semantics of the Nop transformer: if input is passed to a Nop
|
||||
// one would expect it to be unchanged.
|
||||
// - It would be very expensive to pass a converted RuneError to a transformer:
|
||||
// a transformer might need more source bytes after RuneError, meaning that
|
||||
// the only way to pass it safely is to create a new buffer and manage the
|
||||
// intermingling of RuneErrors and normal input.
|
||||
// - Many transformers leave ill-formed UTF-8 as is, so this is not
|
||||
// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a
|
||||
// logical consequence of the operation (as for Map) or if it otherwise would
|
||||
// pose security concerns (as for Remove).
|
||||
// - An alternative would be to return an error on ill-formed UTF-8, but this
|
||||
// would be inconsistent with other operations.
|
||||
|
||||
// If returns a transformer that applies tIn to consecutive runes for which
|
||||
// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset
|
||||
// is called on tIn and tNotIn at the start of each run. A Nop transformer will
|
||||
// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated
|
||||
// to RuneError to determine which transformer to apply, but is passed as is to
|
||||
// the respective transformer.
|
||||
func If(s Set, tIn, tNotIn transform.Transformer) Transformer {
|
||||
if tIn == nil && tNotIn == nil {
|
||||
return Transformer{transform.Nop}
|
||||
}
|
||||
if tIn == nil {
|
||||
tIn = transform.Nop
|
||||
}
|
||||
if tNotIn == nil {
|
||||
tNotIn = transform.Nop
|
||||
}
|
||||
sIn, ok := tIn.(transform.SpanningTransformer)
|
||||
if !ok {
|
||||
sIn = dummySpan{tIn}
|
||||
}
|
||||
sNotIn, ok := tNotIn.(transform.SpanningTransformer)
|
||||
if !ok {
|
||||
sNotIn = dummySpan{tNotIn}
|
||||
}
|
||||
|
||||
a := &cond{
|
||||
tIn: sIn,
|
||||
tNotIn: sNotIn,
|
||||
f: s.Contains,
|
||||
}
|
||||
a.Reset()
|
||||
return Transformer{a}
|
||||
}
|
||||
|
||||
type dummySpan struct{ transform.Transformer }
|
||||
|
||||
func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
return 0, transform.ErrEndOfSpan
|
||||
}
|
||||
|
||||
type cond struct {
|
||||
tIn, tNotIn transform.SpanningTransformer
|
||||
f func(rune) bool
|
||||
check func(rune) bool // current check to perform
|
||||
t transform.SpanningTransformer // current transformer to use
|
||||
}
|
||||
|
||||
// Reset implements transform.Transformer.
|
||||
func (t *cond) Reset() {
|
||||
t.check = t.is
|
||||
t.t = t.tIn
|
||||
t.t.Reset() // notIn will be reset on first usage.
|
||||
}
|
||||
|
||||
func (t *cond) is(r rune) bool {
|
||||
if t.f(r) {
|
||||
return true
|
||||
}
|
||||
t.check = t.isNot
|
||||
t.t = t.tNotIn
|
||||
t.tNotIn.Reset()
|
||||
return false
|
||||
}
|
||||
|
||||
func (t *cond) isNot(r rune) bool {
|
||||
if !t.f(r) {
|
||||
return true
|
||||
}
|
||||
t.check = t.is
|
||||
t.t = t.tIn
|
||||
t.tIn.Reset()
|
||||
return false
|
||||
}
|
||||
|
||||
// This implementation of Span doesn't help all too much, but it needs to be
|
||||
// there to satisfy this package's Transformer interface.
|
||||
// TODO: there are certainly room for improvements, though. For example, if
|
||||
// t.t == transform.Nop (which will a common occurrence) it will save a bundle
|
||||
// to special-case that loop.
|
||||
func (t *cond) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
p := 0
|
||||
for n < len(src) && err == nil {
|
||||
// Don't process too much at a time as the Spanner that will be
|
||||
// called on this block may terminate early.
|
||||
const maxChunk = 4096
|
||||
max := len(src)
|
||||
if v := n + maxChunk; v < max {
|
||||
max = v
|
||||
}
|
||||
atEnd := false
|
||||
size := 0
|
||||
current := t.t
|
||||
for ; p < max; p += size {
|
||||
r := rune(src[p])
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
|
||||
if !atEOF && !utf8.FullRune(src[p:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
if !t.check(r) {
|
||||
// The next rune will be the start of a new run.
|
||||
atEnd = true
|
||||
break
|
||||
}
|
||||
}
|
||||
n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src)))
|
||||
n += n2
|
||||
if err2 != nil {
|
||||
return n, err2
|
||||
}
|
||||
// At this point either err != nil or t.check will pass for the rune at p.
|
||||
p = n + size
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
p := 0
|
||||
for nSrc < len(src) && err == nil {
|
||||
// Don't process too much at a time, as the work might be wasted if the
|
||||
// destination buffer isn't large enough to hold the result or a
|
||||
// transform returns an error early.
|
||||
const maxChunk = 4096
|
||||
max := len(src)
|
||||
if n := nSrc + maxChunk; n < len(src) {
|
||||
max = n
|
||||
}
|
||||
atEnd := false
|
||||
size := 0
|
||||
current := t.t
|
||||
for ; p < max; p += size {
|
||||
r := rune(src[p])
|
||||
if r < utf8.RuneSelf {
|
||||
size = 1
|
||||
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
|
||||
if !atEOF && !utf8.FullRune(src[p:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
}
|
||||
if !t.check(r) {
|
||||
// The next rune will be the start of a new run.
|
||||
atEnd = true
|
||||
break
|
||||
}
|
||||
}
|
||||
nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src)))
|
||||
nDst += nDst2
|
||||
nSrc += nSrc2
|
||||
if err2 != nil {
|
||||
return nDst, nSrc, err2
|
||||
}
|
||||
// At this point either err != nil or t.check will pass for the rune at p.
|
||||
p = nSrc + size
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+355
@@ -0,0 +1,355 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package runes provide transforms for UTF-8 encoded text.
|
||||
package runes // import "golang.org/x/text/runes"
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
// A Set is a collection of runes.
|
||||
type Set interface {
|
||||
// Contains returns true if r is contained in the set.
|
||||
Contains(r rune) bool
|
||||
}
|
||||
|
||||
type setFunc func(rune) bool
|
||||
|
||||
func (s setFunc) Contains(r rune) bool {
|
||||
return s(r)
|
||||
}
|
||||
|
||||
// Note: using funcs here instead of wrapping types result in cleaner
|
||||
// documentation and a smaller API.
|
||||
|
||||
// In creates a Set with a Contains method that returns true for all runes in
|
||||
// the given RangeTable.
|
||||
func In(rt *unicode.RangeTable) Set {
|
||||
return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
|
||||
}
|
||||
|
||||
// In creates a Set with a Contains method that returns true for all runes not
|
||||
// in the given RangeTable.
|
||||
func NotIn(rt *unicode.RangeTable) Set {
|
||||
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
|
||||
}
|
||||
|
||||
// Predicate creates a Set with a Contains method that returns f(r).
|
||||
func Predicate(f func(rune) bool) Set {
|
||||
return setFunc(f)
|
||||
}
|
||||
|
||||
// Transformer implements the transform.Transformer interface.
|
||||
type Transformer struct {
|
||||
t transform.SpanningTransformer
|
||||
}
|
||||
|
||||
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
return t.t.Transform(dst, src, atEOF)
|
||||
}
|
||||
|
||||
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
|
||||
return t.t.Span(b, atEOF)
|
||||
}
|
||||
|
||||
func (t Transformer) Reset() { t.t.Reset() }
|
||||
|
||||
// Bytes returns a new byte slice with the result of converting b using t. It
|
||||
// calls Reset on t. It returns nil if any error was found. This can only happen
|
||||
// if an error-producing Transformer is passed to If.
|
||||
func (t Transformer) Bytes(b []byte) []byte {
|
||||
b, _, err := transform.Bytes(t, b)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// String returns a string with the result of converting s using t. It calls
|
||||
// Reset on t. It returns the empty string if any error was found. This can only
|
||||
// happen if an error-producing Transformer is passed to If.
|
||||
func (t Transformer) String(s string) string {
|
||||
s, _, err := transform.String(t, s)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// TODO:
|
||||
// - Copy: copying strings and bytes in whole-rune units.
|
||||
// - Validation (maybe)
|
||||
// - Well-formed-ness (maybe)
|
||||
|
||||
const runeErrorString = string(utf8.RuneError)
|
||||
|
||||
// Remove returns a Transformer that removes runes r for which s.Contains(r).
|
||||
// Illegal input bytes are replaced by RuneError before being passed to f.
|
||||
func Remove(s Set) Transformer {
|
||||
if f, ok := s.(setFunc); ok {
|
||||
// This little trick cuts the running time of BenchmarkRemove for sets
|
||||
// created by Predicate roughly in half.
|
||||
// TODO: special-case RangeTables as well.
|
||||
return Transformer{remove(f)}
|
||||
}
|
||||
return Transformer{remove(s.Contains)}
|
||||
}
|
||||
|
||||
// TODO: remove transform.RemoveFunc.
|
||||
|
||||
type remove func(r rune) bool
|
||||
|
||||
func (remove) Reset() {}
|
||||
|
||||
// Span implements transform.Spanner.
|
||||
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
for r, size := rune(0), 0; n < len(src); {
|
||||
if r = rune(src[n]); r < utf8.RuneSelf {
|
||||
size = 1
|
||||
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[n:]) {
|
||||
err = transform.ErrShortSrc
|
||||
} else {
|
||||
err = transform.ErrEndOfSpan
|
||||
}
|
||||
break
|
||||
}
|
||||
if t(r) {
|
||||
err = transform.ErrEndOfSpan
|
||||
break
|
||||
}
|
||||
n += size
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Transform implements transform.Transformer.
|
||||
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for r, size := rune(0), 0; nSrc < len(src); {
|
||||
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
||||
size = 1
|
||||
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
// We replace illegal bytes with RuneError. Not doing so might
|
||||
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||
// The resulting byte sequence may subsequently contain runes
|
||||
// for which t(r) is true that were passed unnoticed.
|
||||
if !t(utf8.RuneError) {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = runeErrorString[0]
|
||||
dst[nDst+1] = runeErrorString[1]
|
||||
dst[nDst+2] = runeErrorString[2]
|
||||
nDst += 3
|
||||
}
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
if t(r) {
|
||||
nSrc += size
|
||||
continue
|
||||
}
|
||||
if nDst+size > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
for i := 0; i < size; i++ {
|
||||
dst[nDst] = src[nSrc]
|
||||
nDst++
|
||||
nSrc++
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Map returns a Transformer that maps the runes in the input using the given
|
||||
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
|
||||
// being passed to the mapping func.
|
||||
func Map(mapping func(rune) rune) Transformer {
|
||||
return Transformer{mapper(mapping)}
|
||||
}
|
||||
|
||||
type mapper func(rune) rune
|
||||
|
||||
func (mapper) Reset() {}
|
||||
|
||||
// Span implements transform.Spanner.
|
||||
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
for r, size := rune(0), 0; n < len(src); n += size {
|
||||
if r = rune(src[n]); r < utf8.RuneSelf {
|
||||
size = 1
|
||||
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[n:]) {
|
||||
err = transform.ErrShortSrc
|
||||
} else {
|
||||
err = transform.ErrEndOfSpan
|
||||
}
|
||||
break
|
||||
}
|
||||
if t(r) != r {
|
||||
err = transform.ErrEndOfSpan
|
||||
break
|
||||
}
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
// Transform implements transform.Transformer.
|
||||
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
var replacement rune
|
||||
var b [utf8.UTFMax]byte
|
||||
|
||||
for r, size := rune(0), 0; nSrc < len(src); {
|
||||
if r = rune(src[nSrc]); r < utf8.RuneSelf {
|
||||
if replacement = t(r); replacement < utf8.RuneSelf {
|
||||
if nDst == len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = byte(replacement)
|
||||
nDst++
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
size = 1
|
||||
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
|
||||
if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = runeErrorString[0]
|
||||
dst[nDst+1] = runeErrorString[1]
|
||||
dst[nDst+2] = runeErrorString[2]
|
||||
nDst += 3
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
} else if replacement = t(r); replacement == r {
|
||||
if nDst+size > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
for i := 0; i < size; i++ {
|
||||
dst[nDst] = src[nSrc]
|
||||
nDst++
|
||||
nSrc++
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
n := utf8.EncodeRune(b[:], replacement)
|
||||
|
||||
if nDst+n > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
for i := 0; i < n; i++ {
|
||||
dst[nDst] = b[i]
|
||||
nDst++
|
||||
}
|
||||
nSrc += size
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
|
||||
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
|
||||
func ReplaceIllFormed() Transformer {
|
||||
return Transformer{&replaceIllFormed{}}
|
||||
}
|
||||
|
||||
type replaceIllFormed struct{ transform.NopResetter }
|
||||
|
||||
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
for n < len(src) {
|
||||
// ASCII fast path.
|
||||
if src[n] < utf8.RuneSelf {
|
||||
n++
|
||||
continue
|
||||
}
|
||||
|
||||
r, size := utf8.DecodeRune(src[n:])
|
||||
|
||||
// Look for a valid non-ASCII rune.
|
||||
if r != utf8.RuneError || size != 1 {
|
||||
n += size
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for short source data.
|
||||
if !atEOF && !utf8.FullRune(src[n:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
|
||||
// We have an invalid rune.
|
||||
err = transform.ErrEndOfSpan
|
||||
break
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for nSrc < len(src) {
|
||||
// ASCII fast path.
|
||||
if r := src[nSrc]; r < utf8.RuneSelf {
|
||||
if nDst == len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst] = r
|
||||
nDst++
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for a valid non-ASCII rune.
|
||||
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
|
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += size
|
||||
nSrc += size
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for short source data.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) {
|
||||
err = transform.ErrShortSrc
|
||||
break
|
||||
}
|
||||
|
||||
// We have an invalid rune.
|
||||
if nDst+3 > len(dst) {
|
||||
err = transform.ErrShortDst
|
||||
break
|
||||
}
|
||||
dst[nDst+0] = runeErrorString[0]
|
||||
dst[nDst+1] = runeErrorString[1]
|
||||
dst[nDst+2] = runeErrorString[2]
|
||||
nDst += 3
|
||||
nSrc++
|
||||
}
|
||||
return nDst, nSrc, err
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2009 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+705
@@ -0,0 +1,705 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package transform provides reader and writer wrappers that transform the
|
||||
// bytes passing through as well as various transformations. Example
|
||||
// transformations provided by other packages include normalization and
|
||||
// conversion between character sets.
|
||||
package transform // import "golang.org/x/text/transform"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"io"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
var (
|
||||
// ErrShortDst means that the destination buffer was too short to
|
||||
// receive all of the transformed bytes.
|
||||
ErrShortDst = errors.New("transform: short destination buffer")
|
||||
|
||||
// ErrShortSrc means that the source buffer has insufficient data to
|
||||
// complete the transformation.
|
||||
ErrShortSrc = errors.New("transform: short source buffer")
|
||||
|
||||
// ErrEndOfSpan means that the input and output (the transformed input)
|
||||
// are not identical.
|
||||
ErrEndOfSpan = errors.New("transform: input and output are not identical")
|
||||
|
||||
// errInconsistentByteCount means that Transform returned success (nil
|
||||
// error) but also returned nSrc inconsistent with the src argument.
|
||||
errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
|
||||
|
||||
// errShortInternal means that an internal buffer is not large enough
|
||||
// to make progress and the Transform operation must be aborted.
|
||||
errShortInternal = errors.New("transform: short internal buffer")
|
||||
)
|
||||
|
||||
// Transformer transforms bytes.
|
||||
type Transformer interface {
|
||||
// Transform writes to dst the transformed bytes read from src, and
|
||||
// returns the number of dst bytes written and src bytes read. The
|
||||
// atEOF argument tells whether src represents the last bytes of the
|
||||
// input.
|
||||
//
|
||||
// Callers should always process the nDst bytes produced and account
|
||||
// for the nSrc bytes consumed before considering the error err.
|
||||
//
|
||||
// A nil error means that all of the transformed bytes (whether freshly
|
||||
// transformed from src or left over from previous Transform calls)
|
||||
// were written to dst. A nil error can be returned regardless of
|
||||
// whether atEOF is true. If err is nil then nSrc must equal len(src);
|
||||
// the converse is not necessarily true.
|
||||
//
|
||||
// ErrShortDst means that dst was too short to receive all of the
|
||||
// transformed bytes. ErrShortSrc means that src had insufficient data
|
||||
// to complete the transformation. If both conditions apply, then
|
||||
// either error may be returned. Other than the error conditions listed
|
||||
// here, implementations are free to report other errors that arise.
|
||||
Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
|
||||
|
||||
// Reset resets the state and allows a Transformer to be reused.
|
||||
Reset()
|
||||
}
|
||||
|
||||
// SpanningTransformer extends the Transformer interface with a Span method
|
||||
// that determines how much of the input already conforms to the Transformer.
|
||||
type SpanningTransformer interface {
|
||||
Transformer
|
||||
|
||||
// Span returns a position in src such that transforming src[:n] results in
|
||||
// identical output src[:n] for these bytes. It does not necessarily return
|
||||
// the largest such n. The atEOF argument tells whether src represents the
|
||||
// last bytes of the input.
|
||||
//
|
||||
// Callers should always account for the n bytes consumed before
|
||||
// considering the error err.
|
||||
//
|
||||
// A nil error means that all input bytes are known to be identical to the
|
||||
// output produced by the Transformer. A nil error can be be returned
|
||||
// regardless of whether atEOF is true. If err is nil, then then n must
|
||||
// equal len(src); the converse is not necessarily true.
|
||||
//
|
||||
// ErrEndOfSpan means that the Transformer output may differ from the
|
||||
// input after n bytes. Note that n may be len(src), meaning that the output
|
||||
// would contain additional bytes after otherwise identical output.
|
||||
// ErrShortSrc means that src had insufficient data to determine whether the
|
||||
// remaining bytes would change. Other than the error conditions listed
|
||||
// here, implementations are free to report other errors that arise.
|
||||
//
|
||||
// Calling Span can modify the Transformer state as a side effect. In
|
||||
// effect, it does the transformation just as calling Transform would, only
|
||||
// without copying to a destination buffer and only up to a point it can
|
||||
// determine the input and output bytes are the same. This is obviously more
|
||||
// limited than calling Transform, but can be more efficient in terms of
|
||||
// copying and allocating buffers. Calls to Span and Transform may be
|
||||
// interleaved.
|
||||
Span(src []byte, atEOF bool) (n int, err error)
|
||||
}
|
||||
|
||||
// NopResetter can be embedded by implementations of Transformer to add a nop
|
||||
// Reset method.
|
||||
type NopResetter struct{}
|
||||
|
||||
// Reset implements the Reset method of the Transformer interface.
|
||||
func (NopResetter) Reset() {}
|
||||
|
||||
// Reader wraps another io.Reader by transforming the bytes read.
|
||||
type Reader struct {
|
||||
r io.Reader
|
||||
t Transformer
|
||||
err error
|
||||
|
||||
// dst[dst0:dst1] contains bytes that have been transformed by t but
|
||||
// not yet copied out via Read.
|
||||
dst []byte
|
||||
dst0, dst1 int
|
||||
|
||||
// src[src0:src1] contains bytes that have been read from r but not
|
||||
// yet transformed through t.
|
||||
src []byte
|
||||
src0, src1 int
|
||||
|
||||
// transformComplete is whether the transformation is complete,
|
||||
// regardless of whether or not it was successful.
|
||||
transformComplete bool
|
||||
}
|
||||
|
||||
const defaultBufSize = 4096
|
||||
|
||||
// NewReader returns a new Reader that wraps r by transforming the bytes read
|
||||
// via t. It calls Reset on t.
|
||||
func NewReader(r io.Reader, t Transformer) *Reader {
|
||||
t.Reset()
|
||||
return &Reader{
|
||||
r: r,
|
||||
t: t,
|
||||
dst: make([]byte, defaultBufSize),
|
||||
src: make([]byte, defaultBufSize),
|
||||
}
|
||||
}
|
||||
|
||||
// Read implements the io.Reader interface.
|
||||
func (r *Reader) Read(p []byte) (int, error) {
|
||||
n, err := 0, error(nil)
|
||||
for {
|
||||
// Copy out any transformed bytes and return the final error if we are done.
|
||||
if r.dst0 != r.dst1 {
|
||||
n = copy(p, r.dst[r.dst0:r.dst1])
|
||||
r.dst0 += n
|
||||
if r.dst0 == r.dst1 && r.transformComplete {
|
||||
return n, r.err
|
||||
}
|
||||
return n, nil
|
||||
} else if r.transformComplete {
|
||||
return 0, r.err
|
||||
}
|
||||
|
||||
// Try to transform some source bytes, or to flush the transformer if we
|
||||
// are out of source bytes. We do this even if r.r.Read returned an error.
|
||||
// As the io.Reader documentation says, "process the n > 0 bytes returned
|
||||
// before considering the error".
|
||||
if r.src0 != r.src1 || r.err != nil {
|
||||
r.dst0 = 0
|
||||
r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
|
||||
r.src0 += n
|
||||
|
||||
switch {
|
||||
case err == nil:
|
||||
if r.src0 != r.src1 {
|
||||
r.err = errInconsistentByteCount
|
||||
}
|
||||
// The Transform call was successful; we are complete if we
|
||||
// cannot read more bytes into src.
|
||||
r.transformComplete = r.err != nil
|
||||
continue
|
||||
case err == ErrShortDst && (r.dst1 != 0 || n != 0):
|
||||
// Make room in dst by copying out, and try again.
|
||||
continue
|
||||
case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
|
||||
// Read more bytes into src via the code below, and try again.
|
||||
default:
|
||||
r.transformComplete = true
|
||||
// The reader error (r.err) takes precedence over the
|
||||
// transformer error (err) unless r.err is nil or io.EOF.
|
||||
if r.err == nil || r.err == io.EOF {
|
||||
r.err = err
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Move any untransformed source bytes to the start of the buffer
|
||||
// and read more bytes.
|
||||
if r.src0 != 0 {
|
||||
r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
|
||||
}
|
||||
n, r.err = r.r.Read(r.src[r.src1:])
|
||||
r.src1 += n
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: implement ReadByte (and ReadRune??).
|
||||
|
||||
// Writer wraps another io.Writer by transforming the bytes read.
|
||||
// The user needs to call Close to flush unwritten bytes that may
|
||||
// be buffered.
|
||||
type Writer struct {
|
||||
w io.Writer
|
||||
t Transformer
|
||||
dst []byte
|
||||
|
||||
// src[:n] contains bytes that have not yet passed through t.
|
||||
src []byte
|
||||
n int
|
||||
}
|
||||
|
||||
// NewWriter returns a new Writer that wraps w by transforming the bytes written
|
||||
// via t. It calls Reset on t.
|
||||
func NewWriter(w io.Writer, t Transformer) *Writer {
|
||||
t.Reset()
|
||||
return &Writer{
|
||||
w: w,
|
||||
t: t,
|
||||
dst: make([]byte, defaultBufSize),
|
||||
src: make([]byte, defaultBufSize),
|
||||
}
|
||||
}
|
||||
|
||||
// Write implements the io.Writer interface. If there are not enough
|
||||
// bytes available to complete a Transform, the bytes will be buffered
|
||||
// for the next write. Call Close to convert the remaining bytes.
|
||||
func (w *Writer) Write(data []byte) (n int, err error) {
|
||||
src := data
|
||||
if w.n > 0 {
|
||||
// Append bytes from data to the last remainder.
|
||||
// TODO: limit the amount copied on first try.
|
||||
n = copy(w.src[w.n:], data)
|
||||
w.n += n
|
||||
src = w.src[:w.n]
|
||||
}
|
||||
for {
|
||||
nDst, nSrc, err := w.t.Transform(w.dst, src, false)
|
||||
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
||||
return n, werr
|
||||
}
|
||||
src = src[nSrc:]
|
||||
if w.n == 0 {
|
||||
n += nSrc
|
||||
} else if len(src) <= n {
|
||||
// Enough bytes from w.src have been consumed. We make src point
|
||||
// to data instead to reduce the copying.
|
||||
w.n = 0
|
||||
n -= len(src)
|
||||
src = data[n:]
|
||||
if n < len(data) && (err == nil || err == ErrShortSrc) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
switch err {
|
||||
case ErrShortDst:
|
||||
// This error is okay as long as we are making progress.
|
||||
if nDst > 0 || nSrc > 0 {
|
||||
continue
|
||||
}
|
||||
case ErrShortSrc:
|
||||
if len(src) < len(w.src) {
|
||||
m := copy(w.src, src)
|
||||
// If w.n > 0, bytes from data were already copied to w.src and n
|
||||
// was already set to the number of bytes consumed.
|
||||
if w.n == 0 {
|
||||
n += m
|
||||
}
|
||||
w.n = m
|
||||
err = nil
|
||||
} else if nDst > 0 || nSrc > 0 {
|
||||
// Not enough buffer to store the remainder. Keep processing as
|
||||
// long as there is progress. Without this case, transforms that
|
||||
// require a lookahead larger than the buffer may result in an
|
||||
// error. This is not something one may expect to be common in
|
||||
// practice, but it may occur when buffers are set to small
|
||||
// sizes during testing.
|
||||
continue
|
||||
}
|
||||
case nil:
|
||||
if w.n > 0 {
|
||||
err = errInconsistentByteCount
|
||||
}
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
|
||||
// Close implements the io.Closer interface.
|
||||
func (w *Writer) Close() error {
|
||||
src := w.src[:w.n]
|
||||
for {
|
||||
nDst, nSrc, err := w.t.Transform(w.dst, src, true)
|
||||
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
|
||||
return werr
|
||||
}
|
||||
if err != ErrShortDst {
|
||||
return err
|
||||
}
|
||||
src = src[nSrc:]
|
||||
}
|
||||
}
|
||||
|
||||
type nop struct{ NopResetter }
|
||||
|
||||
func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
n := copy(dst, src)
|
||||
if n < len(src) {
|
||||
err = ErrShortDst
|
||||
}
|
||||
return n, n, err
|
||||
}
|
||||
|
||||
func (nop) Span(src []byte, atEOF bool) (n int, err error) {
|
||||
return len(src), nil
|
||||
}
|
||||
|
||||
type discard struct{ NopResetter }
|
||||
|
||||
func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
return 0, len(src), nil
|
||||
}
|
||||
|
||||
var (
|
||||
// Discard is a Transformer for which all Transform calls succeed
|
||||
// by consuming all bytes and writing nothing.
|
||||
Discard Transformer = discard{}
|
||||
|
||||
// Nop is a SpanningTransformer that copies src to dst.
|
||||
Nop SpanningTransformer = nop{}
|
||||
)
|
||||
|
||||
// chain is a sequence of links. A chain with N Transformers has N+1 links and
|
||||
// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
|
||||
// buffers given to chain.Transform and the middle N-1 buffers are intermediate
|
||||
// buffers owned by the chain. The i'th link transforms bytes from the i'th
|
||||
// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
|
||||
// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
|
||||
type chain struct {
|
||||
link []link
|
||||
err error
|
||||
// errStart is the index at which the error occurred plus 1. Processing
|
||||
// errStart at this level at the next call to Transform. As long as
|
||||
// errStart > 0, chain will not consume any more source bytes.
|
||||
errStart int
|
||||
}
|
||||
|
||||
func (c *chain) fatalError(errIndex int, err error) {
|
||||
if i := errIndex + 1; i > c.errStart {
|
||||
c.errStart = i
|
||||
c.err = err
|
||||
}
|
||||
}
|
||||
|
||||
type link struct {
|
||||
t Transformer
|
||||
// b[p:n] holds the bytes to be transformed by t.
|
||||
b []byte
|
||||
p int
|
||||
n int
|
||||
}
|
||||
|
||||
func (l *link) src() []byte {
|
||||
return l.b[l.p:l.n]
|
||||
}
|
||||
|
||||
func (l *link) dst() []byte {
|
||||
return l.b[l.n:]
|
||||
}
|
||||
|
||||
// Chain returns a Transformer that applies t in sequence.
|
||||
func Chain(t ...Transformer) Transformer {
|
||||
if len(t) == 0 {
|
||||
return nop{}
|
||||
}
|
||||
c := &chain{link: make([]link, len(t)+1)}
|
||||
for i, tt := range t {
|
||||
c.link[i].t = tt
|
||||
}
|
||||
// Allocate intermediate buffers.
|
||||
b := make([][defaultBufSize]byte, len(t)-1)
|
||||
for i := range b {
|
||||
c.link[i+1].b = b[i][:]
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// Reset resets the state of Chain. It calls Reset on all the Transformers.
|
||||
func (c *chain) Reset() {
|
||||
for i, l := range c.link {
|
||||
if l.t != nil {
|
||||
l.t.Reset()
|
||||
}
|
||||
c.link[i].p, c.link[i].n = 0, 0
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: make chain use Span (is going to be fun to implement!)
|
||||
|
||||
// Transform applies the transformers of c in sequence.
|
||||
func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
// Set up src and dst in the chain.
|
||||
srcL := &c.link[0]
|
||||
dstL := &c.link[len(c.link)-1]
|
||||
srcL.b, srcL.p, srcL.n = src, 0, len(src)
|
||||
dstL.b, dstL.n = dst, 0
|
||||
var lastFull, needProgress bool // for detecting progress
|
||||
|
||||
// i is the index of the next Transformer to apply, for i in [low, high].
|
||||
// low is the lowest index for which c.link[low] may still produce bytes.
|
||||
// high is the highest index for which c.link[high] has a Transformer.
|
||||
// The error returned by Transform determines whether to increase or
|
||||
// decrease i. We try to completely fill a buffer before converting it.
|
||||
for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
|
||||
in, out := &c.link[i], &c.link[i+1]
|
||||
nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
|
||||
out.n += nDst
|
||||
in.p += nSrc
|
||||
if i > 0 && in.p == in.n {
|
||||
in.p, in.n = 0, 0
|
||||
}
|
||||
needProgress, lastFull = lastFull, false
|
||||
switch err0 {
|
||||
case ErrShortDst:
|
||||
// Process the destination buffer next. Return if we are already
|
||||
// at the high index.
|
||||
if i == high {
|
||||
return dstL.n, srcL.p, ErrShortDst
|
||||
}
|
||||
if out.n != 0 {
|
||||
i++
|
||||
// If the Transformer at the next index is not able to process any
|
||||
// source bytes there is nothing that can be done to make progress
|
||||
// and the bytes will remain unprocessed. lastFull is used to
|
||||
// detect this and break out of the loop with a fatal error.
|
||||
lastFull = true
|
||||
continue
|
||||
}
|
||||
// The destination buffer was too small, but is completely empty.
|
||||
// Return a fatal error as this transformation can never complete.
|
||||
c.fatalError(i, errShortInternal)
|
||||
case ErrShortSrc:
|
||||
if i == 0 {
|
||||
// Save ErrShortSrc in err. All other errors take precedence.
|
||||
err = ErrShortSrc
|
||||
break
|
||||
}
|
||||
// Source bytes were depleted before filling up the destination buffer.
|
||||
// Verify we made some progress, move the remaining bytes to the errStart
|
||||
// and try to get more source bytes.
|
||||
if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
|
||||
// There were not enough source bytes to proceed while the source
|
||||
// buffer cannot hold any more bytes. Return a fatal error as this
|
||||
// transformation can never complete.
|
||||
c.fatalError(i, errShortInternal)
|
||||
break
|
||||
}
|
||||
// in.b is an internal buffer and we can make progress.
|
||||
in.p, in.n = 0, copy(in.b, in.src())
|
||||
fallthrough
|
||||
case nil:
|
||||
// if i == low, we have depleted the bytes at index i or any lower levels.
|
||||
// In that case we increase low and i. In all other cases we decrease i to
|
||||
// fetch more bytes before proceeding to the next index.
|
||||
if i > low {
|
||||
i--
|
||||
continue
|
||||
}
|
||||
default:
|
||||
c.fatalError(i, err0)
|
||||
}
|
||||
// Exhausted level low or fatal error: increase low and continue
|
||||
// to process the bytes accepted so far.
|
||||
i++
|
||||
low = i
|
||||
}
|
||||
|
||||
// If c.errStart > 0, this means we found a fatal error. We will clear
|
||||
// all upstream buffers. At this point, no more progress can be made
|
||||
// downstream, as Transform would have bailed while handling ErrShortDst.
|
||||
if c.errStart > 0 {
|
||||
for i := 1; i < c.errStart; i++ {
|
||||
c.link[i].p, c.link[i].n = 0, 0
|
||||
}
|
||||
err, c.errStart, c.err = c.err, 0, nil
|
||||
}
|
||||
return dstL.n, srcL.p, err
|
||||
}
|
||||
|
||||
// Deprecated: use runes.Remove instead.
|
||||
func RemoveFunc(f func(r rune) bool) Transformer {
|
||||
return removeF(f)
|
||||
}
|
||||
|
||||
type removeF func(r rune) bool
|
||||
|
||||
func (removeF) Reset() {}
|
||||
|
||||
// Transform implements the Transformer interface.
|
||||
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||||
for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
|
||||
|
||||
if r = rune(src[0]); r < utf8.RuneSelf {
|
||||
sz = 1
|
||||
} else {
|
||||
r, sz = utf8.DecodeRune(src)
|
||||
|
||||
if sz == 1 {
|
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src) {
|
||||
err = ErrShortSrc
|
||||
break
|
||||
}
|
||||
// We replace illegal bytes with RuneError. Not doing so might
|
||||
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||
// The resulting byte sequence may subsequently contain runes
|
||||
// for which t(r) is true that were passed unnoticed.
|
||||
if !t(r) {
|
||||
if nDst+3 > len(dst) {
|
||||
err = ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += copy(dst[nDst:], "\uFFFD")
|
||||
}
|
||||
nSrc++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if !t(r) {
|
||||
if nDst+sz > len(dst) {
|
||||
err = ErrShortDst
|
||||
break
|
||||
}
|
||||
nDst += copy(dst[nDst:], src[:sz])
|
||||
}
|
||||
nSrc += sz
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// grow returns a new []byte that is longer than b, and copies the first n bytes
|
||||
// of b to the start of the new slice.
|
||||
func grow(b []byte, n int) []byte {
|
||||
m := len(b)
|
||||
if m <= 32 {
|
||||
m = 64
|
||||
} else if m <= 256 {
|
||||
m *= 2
|
||||
} else {
|
||||
m += m >> 1
|
||||
}
|
||||
buf := make([]byte, m)
|
||||
copy(buf, b[:n])
|
||||
return buf
|
||||
}
|
||||
|
||||
const initialBufSize = 128
|
||||
|
||||
// String returns a string with the result of converting s[:n] using t, where
|
||||
// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
|
||||
func String(t Transformer, s string) (result string, n int, err error) {
|
||||
t.Reset()
|
||||
if s == "" {
|
||||
// Fast path for the common case for empty input. Results in about a
|
||||
// 86% reduction of running time for BenchmarkStringLowerEmpty.
|
||||
if _, _, err := t.Transform(nil, nil, true); err == nil {
|
||||
return "", 0, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate only once. Note that both dst and src escape when passed to
|
||||
// Transform.
|
||||
buf := [2 * initialBufSize]byte{}
|
||||
dst := buf[:initialBufSize:initialBufSize]
|
||||
src := buf[initialBufSize : 2*initialBufSize]
|
||||
|
||||
// The input string s is transformed in multiple chunks (starting with a
|
||||
// chunk size of initialBufSize). nDst and nSrc are per-chunk (or
|
||||
// per-Transform-call) indexes, pDst and pSrc are overall indexes.
|
||||
nDst, nSrc := 0, 0
|
||||
pDst, pSrc := 0, 0
|
||||
|
||||
// pPrefix is the length of a common prefix: the first pPrefix bytes of the
|
||||
// result will equal the first pPrefix bytes of s. It is not guaranteed to
|
||||
// be the largest such value, but if pPrefix, len(result) and len(s) are
|
||||
// all equal after the final transform (i.e. calling Transform with atEOF
|
||||
// being true returned nil error) then we don't need to allocate a new
|
||||
// result string.
|
||||
pPrefix := 0
|
||||
for {
|
||||
// Invariant: pDst == pPrefix && pSrc == pPrefix.
|
||||
|
||||
n := copy(src, s[pSrc:])
|
||||
nDst, nSrc, err = t.Transform(dst, src[:n], pSrc+n == len(s))
|
||||
pDst += nDst
|
||||
pSrc += nSrc
|
||||
|
||||
// TODO: let transformers implement an optional Spanner interface, akin
|
||||
// to norm's QuickSpan. This would even allow us to avoid any allocation.
|
||||
if !bytes.Equal(dst[:nDst], src[:nSrc]) {
|
||||
break
|
||||
}
|
||||
pPrefix = pSrc
|
||||
if err == ErrShortDst {
|
||||
// A buffer can only be short if a transformer modifies its input.
|
||||
break
|
||||
} else if err == ErrShortSrc {
|
||||
if nSrc == 0 {
|
||||
// No progress was made.
|
||||
break
|
||||
}
|
||||
// Equal so far and !atEOF, so continue checking.
|
||||
} else if err != nil || pPrefix == len(s) {
|
||||
return string(s[:pPrefix]), pPrefix, err
|
||||
}
|
||||
}
|
||||
// Post-condition: pDst == pPrefix + nDst && pSrc == pPrefix + nSrc.
|
||||
|
||||
// We have transformed the first pSrc bytes of the input s to become pDst
|
||||
// transformed bytes. Those transformed bytes are discontiguous: the first
|
||||
// pPrefix of them equal s[:pPrefix] and the last nDst of them equal
|
||||
// dst[:nDst]. We copy them around, into a new dst buffer if necessary, so
|
||||
// that they become one contiguous slice: dst[:pDst].
|
||||
if pPrefix != 0 {
|
||||
newDst := dst
|
||||
if pDst > len(newDst) {
|
||||
newDst = make([]byte, len(s)+nDst-nSrc)
|
||||
}
|
||||
copy(newDst[pPrefix:pDst], dst[:nDst])
|
||||
copy(newDst[:pPrefix], s[:pPrefix])
|
||||
dst = newDst
|
||||
}
|
||||
|
||||
// Prevent duplicate Transform calls with atEOF being true at the end of
|
||||
// the input. Also return if we have an unrecoverable error.
|
||||
if (err == nil && pSrc == len(s)) ||
|
||||
(err != nil && err != ErrShortDst && err != ErrShortSrc) {
|
||||
return string(dst[:pDst]), pSrc, err
|
||||
}
|
||||
|
||||
// Transform the remaining input, growing dst and src buffers as necessary.
|
||||
for {
|
||||
n := copy(src, s[pSrc:])
|
||||
nDst, nSrc, err := t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
|
||||
pDst += nDst
|
||||
pSrc += nSrc
|
||||
|
||||
// If we got ErrShortDst or ErrShortSrc, do not grow as long as we can
|
||||
// make progress. This may avoid excessive allocations.
|
||||
if err == ErrShortDst {
|
||||
if nDst == 0 {
|
||||
dst = grow(dst, pDst)
|
||||
}
|
||||
} else if err == ErrShortSrc {
|
||||
if nSrc == 0 {
|
||||
src = grow(src, 0)
|
||||
}
|
||||
} else if err != nil || pSrc == len(s) {
|
||||
return string(dst[:pDst]), pSrc, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bytes returns a new byte slice with the result of converting b[:n] using t,
|
||||
// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
|
||||
func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
|
||||
return doAppend(t, 0, make([]byte, len(b)), b)
|
||||
}
|
||||
|
||||
// Append appends the result of converting src[:n] using t to dst, where
|
||||
// n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
|
||||
func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
|
||||
if len(dst) == cap(dst) {
|
||||
n := len(src) + len(dst) // It is okay for this to be 0.
|
||||
b := make([]byte, n)
|
||||
dst = b[:copy(b, dst)]
|
||||
}
|
||||
return doAppend(t, len(dst), dst[:cap(dst)], src)
|
||||
}
|
||||
|
||||
func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
|
||||
t.Reset()
|
||||
pSrc := 0
|
||||
for {
|
||||
nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
|
||||
pDst += nDst
|
||||
pSrc += nSrc
|
||||
if err != ErrShortDst {
|
||||
return dst[:pDst], pSrc, err
|
||||
}
|
||||
|
||||
// Grow the destination buffer, but do not grow as long as we can make
|
||||
// progress. This may avoid excessive allocations.
|
||||
if nDst == 0 {
|
||||
dst = grow(dst, pDst)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user