Correctly split three-or-more byte sequences of UTF-8 (#2123)

This commit is contained in:
Ben Wiederhake 2024-05-24 00:02:09 +02:00 committed by GitHub
parent 6b528ffa4f
commit d055b4530e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 21 additions and 2 deletions

View File

@ -211,8 +211,18 @@ func ClipMessage(text string, length int, clippingMessage string) string {
if len(text) > length { if len(text) > length {
text = text[:length-len(clippingMessage)] text = text[:length-len(clippingMessage)]
if r, size := utf8.DecodeLastRuneInString(text); r == utf8.RuneError { for len(text) > 0 {
text = text[:len(text)-size] if r, _ := utf8.DecodeLastRuneInString(text); r == utf8.RuneError {
text = text[:len(text)-1]
// Note: DecodeLastRuneInString only returns the constant value "1" in
// case of an error. We do not yet know whether the last rune is now
// actually valid. Example: "€" is 0xE2 0x82 0xAC. If we happen to split
// the string just before 0xAC, and go back only one byte, that would
// leave us with a string that ends in the byte 0xE2, which is not a valid
// rune, so we need to try again.
} else {
break
}
} }
text += clippingMessage text += clippingMessage
} }

View File

@ -88,6 +88,15 @@ var lineSplittingTestCases = map[string]struct {
}, },
nonSplitOutput: []string{"不布人個我此而及單石業喜資富下我河下日沒一我臺空達的常景便物沒為……子大我別名解成?生賣的全直黑,我自我結毛分洲了世當,是政福那是東;斯說"}, nonSplitOutput: []string{"不布人個我此而及單石業喜資富下我河下日沒一我臺空達的常景便物沒為……子大我別名解成?生賣的全直黑,我自我結毛分洲了世當,是政福那是東;斯說"},
}, },
"Long message, clip three-byte rune after two bytes": {
input: "x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。",
splitOutput: []string{
"x 人人生而自由,在尊嚴和權利上 <clipped message>",
"一律平等。 他們都具有理性和良知 <clipped message>",
",應該以兄弟情誼的精神對待彼此。",
},
nonSplitOutput: []string{"x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。"},
},
} }
func TestGetSubLines(t *testing.T) { func TestGetSubLines(t *testing.T) {