2021-10-16 14:11:32 -07:00
|
|
|
package uniseg
|
|
|
|
|
|
|
|
import "unicode/utf8"
|
|
|
|
|
2024-05-24 14:08:09 -07:00
|
|
|
// Graphemes iterates over the grapheme clusters (user-perceived characters) of
// a string. For every cluster it also exposes word boundaries, sentence
// boundaries, line break opportunities, and the monospace width.
//
// Create an iterator with [NewGraphemes] and call [Graphemes.Next] in a loop
// until it returns false. After each successful call, the accessor methods
// describe the cluster the iterator currently points at.
//
// Graphemes is a convenience wrapper around [StepString]. If only part of this
// information is needed, the specialized functions whose names start with
// "First" are almost always faster.
type Graphemes struct {
	original   string // The complete input string; Reset rewinds to it.
	remaining  string // The part of the input that has not been parsed yet.
	cluster    string // The current grapheme cluster; "" before the first and after the last one.
	offset     int    // Byte offset of the current cluster within original.
	boundaries int    // Boundary and width information returned by the last StepString call.
	state      int    // StepString parser state; -1 before the first Next, -2 once past the end.
}
|
|
|
|
|
|
|
|
// NewGraphemes returns a new grapheme cluster iterator.
|
2024-05-24 14:08:09 -07:00
|
|
|
func NewGraphemes(str string) *Graphemes {
|
|
|
|
return &Graphemes{
|
|
|
|
original: str,
|
|
|
|
remaining: str,
|
|
|
|
state: -1,
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Next advances the iterator by one grapheme cluster and returns false if no
|
|
|
|
// clusters are left. This function must be called before the first cluster is
|
|
|
|
// accessed.
|
|
|
|
func (g *Graphemes) Next() bool {
|
2024-05-24 14:08:09 -07:00
|
|
|
if len(g.remaining) == 0 {
|
|
|
|
// We're already past the end.
|
|
|
|
g.state = -2
|
|
|
|
g.cluster = ""
|
|
|
|
return false
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
2024-05-24 14:08:09 -07:00
|
|
|
g.offset += len(g.cluster)
|
|
|
|
g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
|
|
|
|
return true
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Runes returns a slice of runes (code points) which corresponds to the current
|
2024-05-24 14:08:09 -07:00
|
|
|
// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
|
|
|
|
// has not yet been called, nil is returned.
|
2021-10-16 14:11:32 -07:00
|
|
|
func (g *Graphemes) Runes() []rune {
|
2024-05-24 14:08:09 -07:00
|
|
|
if g.state < 0 {
|
2021-10-16 14:11:32 -07:00
|
|
|
return nil
|
|
|
|
}
|
2024-05-24 14:08:09 -07:00
|
|
|
return []rune(g.cluster)
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Str returns a substring of the original string which corresponds to the
|
2024-05-24 14:08:09 -07:00
|
|
|
// current grapheme cluster. If the iterator is already past the end or
|
|
|
|
// [Graphemes.Next] has not yet been called, an empty string is returned.
|
2021-10-16 14:11:32 -07:00
|
|
|
func (g *Graphemes) Str() string {
|
2024-05-24 14:08:09 -07:00
|
|
|
return g.cluster
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
|
2024-05-24 14:08:09 -07:00
|
|
|
// If the iterator is already past the end or [Graphemes.Next] has not yet been
|
|
|
|
// called, nil is returned.
|
2021-10-16 14:11:32 -07:00
|
|
|
func (g *Graphemes) Bytes() []byte {
|
2024-05-24 14:08:09 -07:00
|
|
|
if g.state < 0 {
|
2021-10-16 14:11:32 -07:00
|
|
|
return nil
|
|
|
|
}
|
2024-05-24 14:08:09 -07:00
|
|
|
return []byte(g.cluster)
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Positions returns the interval of the current grapheme cluster as byte
|
|
|
|
// positions into the original string. The first returned value "from" indexes
|
|
|
|
// the first byte and the second returned value "to" indexes the first byte that
|
|
|
|
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
|
2024-05-24 14:08:09 -07:00
|
|
|
// the original string "str". If [Graphemes.Next] has not yet been called, both
|
|
|
|
// values are 0. If the iterator is already past the end, both values are 1.
|
2021-10-16 14:11:32 -07:00
|
|
|
func (g *Graphemes) Positions() (int, int) {
|
2024-05-24 14:08:09 -07:00
|
|
|
if g.state == -1 {
|
|
|
|
return 0, 0
|
|
|
|
} else if g.state == -2 {
|
|
|
|
return 1, 1
|
|
|
|
}
|
|
|
|
return g.offset, g.offset + len(g.cluster)
|
|
|
|
}
|
|
|
|
|
|
|
|
// IsWordBoundary returns true if a word ends after the current grapheme
|
|
|
|
// cluster.
|
|
|
|
func (g *Graphemes) IsWordBoundary() bool {
|
|
|
|
if g.state < 0 {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return g.boundaries&MaskWord != 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// IsSentenceBoundary returns true if a sentence ends after the current
|
|
|
|
// grapheme cluster.
|
|
|
|
func (g *Graphemes) IsSentenceBoundary() bool {
|
|
|
|
if g.state < 0 {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return g.boundaries&MaskSentence != 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// LineBreak returns whether the line can be broken after the current grapheme
|
|
|
|
// cluster. A value of [LineDontBreak] means the line may not be broken, a value
|
|
|
|
// of [LineMustBreak] means the line must be broken, and a value of
|
|
|
|
// [LineCanBreak] means the line may or may not be broken.
|
|
|
|
func (g *Graphemes) LineBreak() int {
|
|
|
|
if g.state == -1 {
|
|
|
|
return LineDontBreak
|
|
|
|
}
|
|
|
|
if g.state == -2 {
|
|
|
|
return LineMustBreak
|
|
|
|
}
|
|
|
|
return g.boundaries & MaskLine
|
|
|
|
}
|
|
|
|
|
|
|
|
// Width returns the monospace width of the current grapheme cluster.
|
|
|
|
func (g *Graphemes) Width() int {
|
|
|
|
if g.state < 0 {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
return g.boundaries >> ShiftWidth
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Reset puts the iterator into its initial state such that the next call to
|
2024-05-24 14:08:09 -07:00
|
|
|
// [Graphemes.Next] sets it to the first grapheme cluster again.
|
2021-10-16 14:11:32 -07:00
|
|
|
func (g *Graphemes) Reset() {
|
2024-05-24 14:08:09 -07:00
|
|
|
g.state = -1
|
|
|
|
g.offset = 0
|
|
|
|
g.cluster = ""
|
|
|
|
g.remaining = g.original
|
2021-10-16 14:11:32 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// GraphemeClusterCount returns the number of user-perceived characters
|
2024-05-24 14:08:09 -07:00
|
|
|
// (grapheme clusters) for the given string.
|
2021-10-16 14:11:32 -07:00
|
|
|
func GraphemeClusterCount(s string) (n int) {
|
2024-05-24 14:08:09 -07:00
|
|
|
state := -1
|
|
|
|
for len(s) > 0 {
|
|
|
|
_, s, _, state = FirstGraphemeClusterInString(s, state)
|
2021-10-16 14:11:32 -07:00
|
|
|
n++
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
2024-05-24 14:08:09 -07:00
|
|
|
|
|
|
|
// ReverseString reverses the given string while observing grapheme cluster
|
|
|
|
// boundaries.
|
|
|
|
func ReverseString(s string) string {
|
|
|
|
str := []byte(s)
|
|
|
|
reversed := make([]byte, len(str))
|
|
|
|
state := -1
|
|
|
|
index := len(str)
|
|
|
|
for len(str) > 0 {
|
|
|
|
var cluster []byte
|
|
|
|
cluster, str, _, state = FirstGraphemeCluster(str, state)
|
|
|
|
index -= len(cluster)
|
|
|
|
copy(reversed[index:], cluster)
|
|
|
|
if index <= len(str)/2 {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return string(reversed)
|
|
|
|
}
|
|
|
|
|
|
|
|
// shiftGraphemePropState is the number of bits by which a grapheme property is
// shifted to the left so that it can be stored in the same int as the grapheme
// parser state, which occupies the bits below it.
const shiftGraphemePropState = 4
|
|
|
|
|
|
|
|
// FirstGraphemeCluster returns the first grapheme cluster found in the given
|
|
|
|
// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
|
|
|
|
// Cluster Boundaries]. This function can be called continuously to extract all
|
|
|
|
// grapheme clusters from a byte slice, as illustrated in the example below.
|
|
|
|
//
|
|
|
|
// If you don't know the current state, for example when calling the function
|
|
|
|
// for the first time, you must pass -1. For consecutive calls, pass the state
|
|
|
|
// and rest slice returned by the previous call.
|
|
|
|
//
|
|
|
|
// The "rest" slice is the sub-slice of the original byte slice "b" starting
|
|
|
|
// after the last byte of the identified grapheme cluster. If the length of the
|
|
|
|
// "rest" slice is 0, the entire byte slice "b" has been processed. The
|
|
|
|
// "cluster" byte slice is the sub-slice of the input slice containing the
|
|
|
|
// identified grapheme cluster.
|
|
|
|
//
|
|
|
|
// The returned width is the width of the grapheme cluster for most monospace
|
|
|
|
// fonts where a value of 1 represents one character cell.
|
|
|
|
//
|
|
|
|
// Given an empty byte slice "b", the function returns nil values.
|
|
|
|
//
|
|
|
|
// While slightly less convenient than using the Graphemes class, this function
|
|
|
|
// has much better performance and makes no allocations. It lends itself well to
|
|
|
|
// large byte slices.
|
|
|
|
//
|
|
|
|
// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
|
|
|
|
func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
|
|
|
|
// An empty byte slice returns nothing.
|
|
|
|
if len(b) == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Extract the first rune.
|
|
|
|
r, length := utf8.DecodeRune(b)
|
|
|
|
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
|
|
|
|
var prop int
|
|
|
|
if state < 0 {
|
|
|
|
prop = propertyGraphemes(r)
|
|
|
|
} else {
|
|
|
|
prop = state >> shiftGraphemePropState
|
|
|
|
}
|
|
|
|
return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we don't know the state, determine it now.
|
|
|
|
var firstProp int
|
|
|
|
if state < 0 {
|
|
|
|
state, firstProp, _ = transitionGraphemeState(state, r)
|
|
|
|
} else {
|
|
|
|
firstProp = state >> shiftGraphemePropState
|
|
|
|
}
|
|
|
|
width += runeWidth(r, firstProp)
|
|
|
|
|
|
|
|
// Transition until we find a boundary.
|
|
|
|
for {
|
|
|
|
var (
|
|
|
|
prop int
|
|
|
|
boundary bool
|
|
|
|
)
|
|
|
|
|
|
|
|
r, l := utf8.DecodeRune(b[length:])
|
|
|
|
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
|
|
|
|
|
|
|
|
if boundary {
|
|
|
|
return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
|
|
|
|
}
|
|
|
|
|
|
|
|
if firstProp == prExtendedPictographic {
|
|
|
|
if r == vs15 {
|
|
|
|
width = 1
|
|
|
|
} else if r == vs16 {
|
|
|
|
width = 2
|
|
|
|
}
|
|
|
|
} else if firstProp != prRegionalIndicator && firstProp != prL {
|
|
|
|
width += runeWidth(r, prop)
|
|
|
|
}
|
|
|
|
|
|
|
|
length += l
|
|
|
|
if len(b) <= length {
|
|
|
|
return b, nil, width, grAny | (prop << shiftGraphemePropState)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
|
|
|
|
// outputs are strings.
|
|
|
|
func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
|
|
|
|
// An empty string returns nothing.
|
|
|
|
if len(str) == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Extract the first rune.
|
|
|
|
r, length := utf8.DecodeRuneInString(str)
|
|
|
|
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
|
|
|
|
var prop int
|
|
|
|
if state < 0 {
|
|
|
|
prop = propertyGraphemes(r)
|
|
|
|
} else {
|
|
|
|
prop = state >> shiftGraphemePropState
|
|
|
|
}
|
|
|
|
return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we don't know the state, determine it now.
|
|
|
|
var firstProp int
|
|
|
|
if state < 0 {
|
|
|
|
state, firstProp, _ = transitionGraphemeState(state, r)
|
|
|
|
} else {
|
|
|
|
firstProp = state >> shiftGraphemePropState
|
|
|
|
}
|
|
|
|
width += runeWidth(r, firstProp)
|
|
|
|
|
|
|
|
// Transition until we find a boundary.
|
|
|
|
for {
|
|
|
|
var (
|
|
|
|
prop int
|
|
|
|
boundary bool
|
|
|
|
)
|
|
|
|
|
|
|
|
r, l := utf8.DecodeRuneInString(str[length:])
|
|
|
|
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
|
|
|
|
|
|
|
|
if boundary {
|
|
|
|
return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
|
|
|
|
}
|
|
|
|
|
|
|
|
if firstProp == prExtendedPictographic {
|
|
|
|
if r == vs15 {
|
|
|
|
width = 1
|
|
|
|
} else if r == vs16 {
|
|
|
|
width = 2
|
|
|
|
}
|
|
|
|
} else if firstProp != prRegionalIndicator && firstProp != prL {
|
|
|
|
width += runeWidth(r, prop)
|
|
|
|
}
|
|
|
|
|
|
|
|
length += l
|
|
|
|
if len(str) <= length {
|
|
|
|
return str, "", width, grAny | (prop << shiftGraphemePropState)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|