Pulled translation code into separate funcs.

This commit is contained in:
Patrick Connolly
2018-10-25 22:16:47 +08:00
parent e62b9b4f65
commit 48d72522ed
2 changed files with 131 additions and 75 deletions

View File

@@ -124,7 +124,6 @@ func (b *Bslack) replaceMention(text string) string {
}
return match
}
b.Log.Debugf("FOUND USERNAME")
return mentionRE.ReplaceAllStringFunc(text, replaceFunc)
}

View File

@@ -19,44 +19,72 @@ import (
"cloud.google.com/go/translate"
)
// Precompiled patterns used by the translation helpers to find the parts of a
// message that get wrapped in <span translate='no'> or temporarily rewritten.
var (
// uriRE matches URLs that begin with http:// or https://. The final character
// class keeps trailing punctuation such as '.', ',' or ';' out of the match.
uriRE = regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`)
// usernameRE matches '@' followed by one or more ASCII letters, digits or hyphens.
usernameRE = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`)
// channelsRE matches '#' followed by one or more ASCII letters, digits or hyphens.
channelsRE = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`)
// bugfixRE matches a colon followed by a space or a '$' character.
// NOTE(review): inside a character class '$' is a literal dollar sign, not an
// end-of-text anchor — confirm that a colon at the end of the text is not
// meant to match as well.
bugfixRE = regexp.MustCompile(`(:)([ $])`)
// bugfixUndoRE matches the placeholder character 'ː' followed by a space or a
// '$' character; it mirrors bugfixRE so the substitution can be reverted.
bugfixUndoRE = regexp.MustCompile(`(ː)([ $])`)
)
// translationEnabled reports whether gw.Router.GTClient is set (non-nil).
func (gw *Gateway) translationEnabled() bool {
	client := gw.Router.GTClient
	return client != nil
}
func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) {
// Skip if channel locale not set
if channel.Options.Locale == "" {
return
}
// Don't try to translate empty messages
if msg.OrigMsg.Text == "" {
return
}
msg.IsTranslation = true
ctx := context.Background()
client := gw.Router.GTClient
defer client.Close()
text := msg.Text
var results [][]string
// colons: add temp token
// This is an ugly hack to work around what seems to be a bug in the Google Translate API.
// See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199
text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "<span translate='no'>ː$2</span>")
// url
url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`)
text = url_re.ReplaceAllString(text, "<span translate='no'>$0</span>")
flog.Debugf("pre-parseMD:"+text)
// protectUrls wraps every match of uriRE in text in a <span translate='no'>
// element and returns the resulting string.
func protectUrls(text string) string {
	const wrapped = "<span translate='no'>$0</span>"
	return uriRE.ReplaceAllString(text, wrapped)
}
// protectBullets replaces every "•" bullet character in text with "-" and
// returns the result. Slack uses these bullets, and they confuse translation.
func protectBullets(text string) string {
	return strings.Replace(text, "•", "-", -1)
}
// protectUsernames wraps each @username found in text in a
// <span translate='no'> element and returns the result.
//
// Only occurrences preceded by a character other than '>' are wrapped, so a
// name that directly follows a tag (including one wrapped by an earlier
// iteration) is left untouched.
func protectUsernames(text string) string {
	results := usernameRE.FindAllStringSubmatch(text, -1)
	// Sort so that the longest usernames are acted on first: a shorter name
	// that is a prefix of an already-wrapped one then sits right after the
	// span's '>' and is skipped.
	sort.SliceStable(results, func(i, j int) bool {
		return len(results[i][1]) > len(results[j][1])
	})
	for _, r := range results {
		// Quote the name so it is always matched literally.
		pattern := regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, regexp.QuoteMeta(r[1])))
		text = pattern.ReplaceAllString(text, "$1<span translate='no'>$2</span>")
	}
	// Pass text as an argument, never as the format string: message text may
	// contain '%' (for example in URLs), which would garble the log line.
	flog.Debugf("Post cleanup:usernames: %s", text)
	return text
}
// protectChannels wraps each #channel found in text in a
// <span translate='no'> element and returns the result.
//
// Only occurrences preceded by a character other than '>' are wrapped, so a
// name that directly follows a tag (including one wrapped by an earlier
// iteration) is left untouched.
func protectChannels(text string) string {
	results := channelsRE.FindAllStringSubmatch(text, -1)
	// Sort so that the longest channel names are acted on first.
	sort.SliceStable(results, func(i, j int) bool {
		return len(results[i][1]) > len(results[j][1])
	})
	for _, r := range results {
		// If a channel that's a substring of another channel (processed earlier)
		// matches, it is skipped because of the tag's '>' directly in front.
		// Quote the name so it is always matched literally.
		pattern := regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, regexp.QuoteMeta(r[1])))
		text = pattern.ReplaceAllString(text, "$1<span translate='no'>$2</span>")
	}
	// Pass text as an argument, never as the format string: message text may
	// contain '%' (for example in URLs), which would garble the log line.
	flog.Debugf("Post cleanup:channels: %s", text)
	return text
}
// protectEmoji wraps emoji in text in <span translate='no'> elements and
// returns the result. Two forms are handled: ":name:" shortcodes and whatever
// emoji.NewEmojiParser() matches (literal emoji codepoints, e.g. 💎).
func protectEmoji(text string) string {
	const noTranslate = "<span translate='no'>$0</span>"

	// :emoji: shortcodes
	text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, noTranslate)

	// :emoji: codepoints, ie. 💎
	text = emoji.NewEmojiParser().ReplaceAllString(text, noTranslate)

	// Pass text as an argument, never as the format string: message text may
	// contain '%', which would garble the log line.
	flog.Debugf("post cleanup:emojis:%s", text)
	return text
}
func convertMarkdown2Html(text string) string {
// Make sure we use closed <br/> tags
const htmlFlags = blackfriday.UseXHTML
const extensions = blackfriday.HardLineBreak |
@@ -70,64 +98,47 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c
blackfriday.WithExtensions(extensions),
blackfriday.WithRenderer(renderer),
}
output := blackfriday.Run([]byte(text), optList...)
text = string(output)
flog.Debugf("post-parseMD:"+string(output))
text = string(blackfriday.Run([]byte(text), optList...))
flog.Debugf("Post-md2html: " + text)
// @usernames
results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
// Sort so that longest channel names are acted on first
sort.SliceStable(results, func(i, j int) bool {
return len(results[i][1]) > len(results[j][1])
})
for _, r := range results {
text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
}
return text
}
flog.Debugf("post cleanup:usernames:"+text)
// translateText parses locale into a language tag, passes text to
// gw.Router.GTClient.Translate with HTML format and the "nmt" model, and
// returns the translated text of the first result.
//
// NOTE(review): this span appears to interleave pre- and post-refactor lines:
// `results` and `channel` are not declared here, `channelLang, err :=` occurs
// twice, and there is a bare `return` in a function that returns string.
// Confirm against the actual file before relying on these comments.
// NOTE(review): the text sent for translation is read from msg.Text, not
// taken as a parameter — confirm that callers which pre-process the text
// store it back into msg.Text first.
func (gw *Gateway) translateText(msg *config.Message, locale string) string {
ctx := context.Background()
text := msg.Text
// #channels
results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
// Sort so that longest channel names are acted on first
sort.SliceStable(results, func(i, j int) bool {
return len(results[i][1]) > len(results[j][1])
})
for _, r := range results {
// If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the <tag> in front
text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
}
flog.Debugf("post cleanup:channels:"+text)
// :emoji:
text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "<span translate='no'>$0</span>")
// :emoji: codepoints, ie. 💎
text = emoji.NewEmojiParser().ReplaceAllString(text, "<span translate='no'>$0</span>")
flog.Debugf("post cleanup:emojis:"+text)
channelLang, err := language.Parse(channel.Options.Locale)
channelLang, err := language.Parse(locale)
// A parse failure is only logged; execution continues with whatever
// channelLang holds.
if err != nil {
flog.Error(err)
}
client := gw.Router.GTClient
// NOTE(review): this closes the client stored on gw.Router when the function
// returns — confirm that later calls are not expected to reuse it.
defer client.Close()
// NOTE(review): the error from Translate is discarded and resp[0] is indexed
// below without a length check — confirm resp cannot be empty on failure.
resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{
Format: "html",
Model: "nmt",
})
text = resp[0].Text
// NOTE(review): text is concatenated into the Debugf format string in the two
// calls below, so a '%' in the translated text would be read as a verb.
flog.Debugf("post-translate:"+text)
flog.Debugf("Post-translation: " + text)
// When the reported source language equals the target language, the message
// is marked as not being a translation.
if resp[0].Source == channelLang {
msg.IsTranslation = false
return
}
// Add space buffer after html <span> before stripping, or characters after tags get merged into urls or usernames
text = regexp.MustCompile(`<span translate='no'>.+?</span>`).ReplaceAllString(text, " $0 ")
return text
}
// guardBugfix swaps the colon in every bugfixRE match for the placeholder
// character 'ː', wrapped together with the following character in a
// <span translate='no'> element, and returns the result.
//
// This is an ugly hack to work around what seems to be a bug in the Google
// Translate API.
// See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199
func guardBugfix(text string) string {
	const placeholder = "<span translate='no'>ː$2</span>"
	guarded := bugfixRE.ReplaceAllString(text, placeholder)
	return guarded
}
func stripHtml(text string) string {
allowableTags := []string{
"p",
"i",
@@ -146,7 +157,12 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c
stripped, _ := htmltags.Strip(text, allowableTags, false)
text = stripped.ToString()
flog.Debugf("post-strip:"+text)
flog.Debugf("Post-strip: " + text)
return text
}
func convertHtml2Markdown(text string) string {
html2md.AddRule("del", &html2md.Rule{
Patterns: []string{"del"},
Replacement: func(innerHTML string, attrs []string) string {
@@ -157,6 +173,7 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c
return ""
},
})
// Custom override for slackdown
html2md.AddRule("b", &html2md.Rule{
Patterns: []string{"b", "strong"},
@@ -168,6 +185,7 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c
return ""
},
})
// Custom override of default code rule:
// This converts multiline code tags to codeblocks
html2md.AddRule("code", &html2md.Rule{
@@ -185,13 +203,49 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c
return ""
},
})
text = html2md.Convert(text)
text = html2md.Convert(text)
flog.Debugf("Post-html2md: " + text)
return text
}
// unguardBugfix reverts the temporary colon token: every bugfixUndoRE match
// (the placeholder 'ː' followed by a space or '$') gets its placeholder
// replaced by a regular ':' and the following character kept as is.
//
// The substitution is done once with the precompiled package-level pattern;
// compiling the same expression again on every call is unnecessary.
func unguardBugfix(text string) string {
	return bugfixUndoRE.ReplaceAllString(text, ":$2")
}
func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) {
// Skip if channel locale not set
if channel.Options.Locale == "" {
return
}
// Don't try to translate empty messages
if msg.OrigMsg.Text == "" {
return
}
msg.IsTranslation = true
text := msg.Text
text = guardBugfix(text)
text = protectUrls(text)
text = protectBullets(text)
text = convertMarkdown2Html(text)
text = protectUsernames(text)
text = protectChannels(text)
text = protectEmoji(text)
text = gw.translateText(msg, channel.Options.Locale)
// Add space buffer after html <span> before stripping, or characters after tags get merged into urls or usernames
text = regexp.MustCompile(`<span translate='no'>.+?</span>`).ReplaceAllString(text, " $0 ")
text = stripHtml(text)
text = convertHtml2Markdown(text)
text = unguardBugfix(text)
flog.Debugf("post-MDconvert:"+text)
text = html.UnescapeString(text)
flog.Debugf("post-unescaped:"+text)
@@ -199,6 +253,9 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c
// eg. messages with only emoji, links, or untranslatable gibberish
if strings.ToLower(strings.Replace(text, " ", "", -1)) == strings.ToLower(strings.Replace(msg.Text, " ", "", -1)) {
msg.IsTranslation = false
}
if msg.IsTranslation == false {
return
}