diff --git a/bridge/slack/helpers.go b/bridge/slack/helpers.go index 5afd526b..8508d152 100644 --- a/bridge/slack/helpers.go +++ b/bridge/slack/helpers.go @@ -124,7 +124,6 @@ func (b *Bslack) replaceMention(text string) string { } return match } - b.Log.Debugf("FOUND USERNAME") return mentionRE.ReplaceAllStringFunc(text, replaceFunc) } diff --git a/gateway/translation.go b/gateway/translation.go index 2c265229..0f31c14f 100644 --- a/gateway/translation.go +++ b/gateway/translation.go @@ -19,44 +19,72 @@ import ( "cloud.google.com/go/translate" ) +var ( + uriRE = regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`) + usernameRE = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`) + channelsRE = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`) + bugfixRE = regexp.MustCompile(`(:)([ $])`) + bugfixUndoRE = regexp.MustCompile(`(ː)([ $])`) +) + func (gw *Gateway) translationEnabled() bool { return gw.Router.GTClient != nil } -func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) { - // Skip if channel locale not set - if channel.Options.Locale == "" { - return - } - - // Don't try to translate empty messages - if msg.OrigMsg.Text == "" { - return - } - - msg.IsTranslation = true - ctx := context.Background() - - client := gw.Router.GTClient - defer client.Close() - - text := msg.Text - var results [][]string - - // colons: add temp token - // This is an ugly hack to work around what seems to be a bug in the Google Translate API. - // See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199 - text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "ː$2") - - // url - url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`) - text = url_re.ReplaceAllString(text, "$0") - - flog.Debugf("pre-parseMD:"+text) +func protectUrls(text string) string { + return uriRE.ReplaceAllString(text, "$0") +} +func protectBullets(text string) string { // Get rid of these wierdo bullets that Slack uses, which confuse translation - text = strings.Replace(text, "•", "-", -1) + return strings.Replace(text, "•", "-", -1) +} +func protectUsernames(text string) string { + // @usernames + results := usernameRE.FindAllStringSubmatch(text, -1) + + // Sort so that longest channel names are acted on first + sort.SliceStable(results, func(i, j int) bool { + return len(results[i][1]) > len(results[j][1]) + }) + for _, r := range results { + text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") + } + flog.Debugf("Post cleanup:usernames: " + text) + + return text +} + +func protectChannels(text string) string { + // #channels + results := channelsRE.FindAllStringSubmatch(text, -1) + // Sort so that longest channel names are acted on first + sort.SliceStable(results, func(i, j int) bool { + return len(results[i][1]) > len(results[j][1]) + }) + for _, r := range results { + // If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the in front + text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") + } + flog.Debugf("Post cleanup:channels: " + text) + + return text + +} + +func protectEmoji(text string) string { + // :emoji: + text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "$0") + + // :emoji: codepoints, ie. 💎 + text = emoji.NewEmojiParser().ReplaceAllString(text, "$0") + + flog.Debugf("post cleanup:emojis:"+text) + return text +} + +func convertMarkdown2Html(text string) string { // Make sure we use closed
tags const htmlFlags = blackfriday.UseXHTML const extensions = blackfriday.HardLineBreak | @@ -70,64 +98,47 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c blackfriday.WithExtensions(extensions), blackfriday.WithRenderer(renderer), } - output := blackfriday.Run([]byte(text), optList...) - text = string(output) - flog.Debugf("post-parseMD:"+string(output)) + text = string(blackfriday.Run([]byte(text), optList...)) + flog.Debugf("Post-md2html: " + text) - // @usernames - results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) - // Sort so that longest channel names are acted on first - sort.SliceStable(results, func(i, j int) bool { - return len(results[i][1]) > len(results[j][1]) - }) - for _, r := range results { - text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") - } + return text +} - flog.Debugf("post cleanup:usernames:"+text) +func (gw *Gateway) translateText(msg *config.Message, locale string) string { + ctx := context.Background() + text := msg.Text - // #channels - results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) - // Sort so that longest channel names are acted on first - sort.SliceStable(results, func(i, j int) bool { - return len(results[i][1]) > len(results[j][1]) - }) - for _, r := range results { - // If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the in front - text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") - } - - flog.Debugf("post cleanup:channels:"+text) - - // :emoji: - text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "$0") - - // :emoji: codepoints, ie. 💎 - text = emoji.NewEmojiParser().ReplaceAllString(text, "$0") - - flog.Debugf("post cleanup:emojis:"+text) - - channelLang, err := language.Parse(channel.Options.Locale) + channelLang, err := language.Parse(locale) if err != nil { flog.Error(err) } + client := gw.Router.GTClient + defer client.Close() + resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{ Format: "html", Model: "nmt", }) text = resp[0].Text - flog.Debugf("post-translate:"+text) + flog.Debugf("Post-translation: " + text) if resp[0].Source == channelLang { msg.IsTranslation = false - return } - // Add space buffer after html before stripping, or characters after tags get merged into urls or usernames - text = regexp.MustCompile(`.+?`).ReplaceAllString(text, " $0 ") + return text +} +func guardBugfix(text string) string { + // colons: add temp token + // This is an ugly hack to work around what seems to be a bug in the Google Translate API. + // See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199 + return bugfixRE.ReplaceAllString(text, "ː$2") +} + +func stripHtml(text string) string { allowableTags := []string{ "p", "i", @@ -146,7 +157,12 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c stripped, _ := htmltags.Strip(text, allowableTags, false) text = stripped.ToString() - flog.Debugf("post-strip:"+text) + flog.Debugf("Post-strip: " + text) + + return text +} + +func convertHtml2Markdown(text string) string { html2md.AddRule("del", &html2md.Rule{ Patterns: []string{"del"}, Replacement: func(innerHTML string, attrs []string) string { @@ -157,6 +173,7 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c return "" }, }) + // Custom override for slackdown html2md.AddRule("b", &html2md.Rule{ Patterns: []string{"b", "strong"}, @@ -168,6 +185,7 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c return "" }, }) + // Custom override of default code rule: // This converts multiline code tags to codeblocks html2md.AddRule("code", &html2md.Rule{ @@ -185,13 +203,49 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c return "" }, }) - text = html2md.Convert(text) + text = html2md.Convert(text) + flog.Debugf("Post-html2md: " + text) + + return text +} + +func unguardBugfix(text string) string { // colons: revert temp token // See: previous comment on colons - text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2") + return bugfixUndoRE.ReplaceAllString(text, ":$2") +} + +func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) { + // Skip if channel locale not set + if channel.Options.Locale == "" { + return + } + + // Don't try to translate empty messages + if msg.OrigMsg.Text == "" { + return + } + + msg.IsTranslation = true + + text := msg.Text + text = guardBugfix(text) + text = protectUrls(text) + text = protectBullets(text) + text = convertMarkdown2Html(text) + text = protectUsernames(text) + text = protectChannels(text) + text = protectEmoji(text) + text = gw.translateText(msg, channel.Options.Locale) + + // Add space buffer after html before stripping, or characters after tags get merged into urls or usernames + text = regexp.MustCompile(`.+?`).ReplaceAllString(text, " $0 ") + + text = stripHtml(text) + text = convertHtml2Markdown(text) + text = unguardBugfix(text) - flog.Debugf("post-MDconvert:"+text) text = html.UnescapeString(text) flog.Debugf("post-unescaped:"+text) @@ -199,6 +253,9 @@ func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, c // eg. messages with only emoji, links, or untranslatable gibberish if strings.ToLower(strings.Replace(text, " ", "", -1)) == strings.ToLower(strings.Replace(msg.Text, " ", "", -1)) { msg.IsTranslation = false + } + + if msg.IsTranslation == false { return }