From ccadda3ec24ee735f2cb9de71aee9381d29d4d7e Mon Sep 17 00:00:00 2001 From: Patrick Connolly Date: Wed, 24 Oct 2018 01:10:10 +0800 Subject: [PATCH] Extracted translation code in separate files. --- gateway/blackfriday.go | 69 ++++++++++++ gateway/gateway.go | 248 +---------------------------------------- gateway/translation.go | 199 +++++++++++++++++++++++++++++++++ 3 files changed, 269 insertions(+), 247 deletions(-) create mode 100644 gateway/blackfriday.go create mode 100644 gateway/translation.go diff --git a/gateway/blackfriday.go b/gateway/blackfriday.go new file mode 100644 index 00000000..f76e529d --- /dev/null +++ b/gateway/blackfriday.go @@ -0,0 +1,69 @@ +package gateway + +import ( + "bytes" + "strings" + + "github.com/russross/blackfriday" +) + +type renderer struct { + *blackfriday.Html +} + +func doubleSpace(out *bytes.Buffer) { + if out.Len() > 0 { + out.WriteByte('\n') + } +} + +func escapeSingleChar(char byte) (string, bool) { + if char == '"' { + return """, true + } + if char == '&' { + return "&", true + } + if char == '<' { + return "<", true + } + if char == '>' { + return ">", true + } + return "", false +} + +func attrEscape(out *bytes.Buffer, src []byte) { + org := 0 + for i, ch := range src { + if entity, ok := escapeSingleChar(ch); ok { + if i > org { + // copy all the normal characters since the last escape + out.Write(src[org:i]) + } + org = i + 1 + out.WriteString(entity) + } + } + if org < len(src) { + out.Write(src[org:]) + } +} + +// Using rather than
 keeps Google Translate from trying to process it.
+// BUT it collapses code into one line for some reason, and <pre> preserves newlines.
+// #TODO Investigating the <pre><code> combo might work.
+func (*renderer) BlockCode(out *bytes.Buffer, text []byte, info string) {
+	doubleSpace(out)
+
+	endOfLang := strings.IndexAny(info, "\t ")
+	if endOfLang < 0 {
+		endOfLang = len(info)
+	}
+	lang := info[:endOfLang]
+	if len(lang) == 0 || lang == "." {
+		out.WriteString("
")
+	}
+	attrEscape(out, text)
+	out.WriteString("
\n") +} diff --git a/gateway/gateway.go b/gateway/gateway.go index 5d54f541..1b62e9b8 100644 --- a/gateway/gateway.go +++ b/gateway/gateway.go @@ -6,12 +6,6 @@ import ( "io/ioutil" "net/http" "os" - "context" - "html" - "sort" - "github.com/darkoatanasovski/htmltags" - "github.com/urakozz/go-emoji" - "github.com/lunny/html2md" "github.com/42wim/matterbridge/bridge" "github.com/42wim/matterbridge/bridge/api" @@ -29,11 +23,8 @@ import ( bxmpp "github.com/42wim/matterbridge/bridge/xmpp" bzulip "github.com/42wim/matterbridge/bridge/zulip" "github.com/hashicorp/golang-lru" - "github.com/russross/blackfriday" log "github.com/sirupsen/logrus" // "github.com/davecgh/go-spew/spew" - "cloud.google.com/go/translate" - "golang.org/x/text/language" "crypto/sha1" "path/filepath" "regexp" @@ -215,242 +206,6 @@ func (gw *Gateway) getDestChannel(msg *config.Message, dest bridge.Bridge) []con return channels } -type renderer struct { - *blackfriday.Html -} - -func doubleSpace(out *bytes.Buffer) { - if out.Len() > 0 { - out.WriteByte('\n') - } -} - -func escapeSingleChar(char byte) (string, bool) { - if char == '"' { - return """, true - } - if char == '&' { - return "&", true - } - if char == '<' { - return "<", true - } - if char == '>' { - return ">", true - } - return "", false -} - -func attrEscape(out *bytes.Buffer, src []byte) { - org := 0 - for i, ch := range src { - if entity, ok := escapeSingleChar(ch); ok { - if i > org { - // copy all the normal characters since the last escape - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString(entity) - } - } - if org < len(src) { - out.Write(src[org:]) - } -} - -// Using rather than
 keeps Google Translate from trying to process it.
-// BUT it collapses code into one line for some reason, and <pre> preserves newlines.
-// #TODO Investigating the <pre><code> combo might work.
-func (*renderer) BlockCode(out *bytes.Buffer, text []byte, info string) {
-	doubleSpace(out)
-
-	endOfLang := strings.IndexAny(info, "\t ")
-	if endOfLang < 0 {
-		endOfLang = len(info)
-	}
-	lang := info[:endOfLang]
-	if len(lang) == 0 || lang == "." {
-		out.WriteString("
")
-	}
-	attrEscape(out, text)
-	out.WriteString("
\n") -} - -func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) { - // Skip if channel locale not set - if channel.Options.Locale == "" { - return - } - - // Don't try to translate empty messages - if msg.OrigMsg.Text == "" { - return - } - - msg.IsTranslation = true - ctx := context.Background() - - client := gw.Router.GTClient - defer client.Close() - - text := msg.Text - var results [][]string - - // colons: add temp token - // This is an ugly hack to work around what seems to be a bug in the Google Translate API. - // See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199 - text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "ː$2") - - // url - url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`) - text = url_re.ReplaceAllString(text, "$0") - - flog.Debugf("pre-parseMD:"+text) - - // Get rid of these wierdo bullets that Slack uses, which confuse translation - text = strings.Replace(text, "•", "-", -1) - - // Make sure we use closed
tags - const htmlFlags = blackfriday.HTML_USE_XHTML - renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)} - const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK | - blackfriday.EXTENSION_HARD_LINE_BREAK | - blackfriday.EXTENSION_STRIKETHROUGH | - blackfriday.EXTENSION_FENCED_CODE | - blackfriday.EXTENSION_HARD_LINE_BREAK - output := blackfriday.Markdown([]byte(text), renderer, extensions) - text = string(output) - flog.Debugf("post-parseMD:"+string(output)) - - // @usernames - results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) - // Sort so that longest channel names are acted on first - sort.SliceStable(results, func(i, j int) bool { - return len(results[i][1]) > len(results[j][1]) - }) - for _, r := range results { - text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") - } - - flog.Debugf("post cleanup:usernames:"+text) - - // #channels - results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) - // Sort so that longest channel names are acted on first - sort.SliceStable(results, func(i, j int) bool { - return len(results[i][1]) > len(results[j][1]) - }) - for _, r := range results { - // If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the in front - text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") - } - - flog.Debugf("post cleanup:channels:"+text) - - // :emoji: - text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "$0") - - // :emoji: codepoints, ie. 
💎 - text = emoji.NewEmojiParser().ReplaceAllString(text, "$0") - - flog.Debugf("post cleanup:emojis:"+text) - - channelLang, err := language.Parse(channel.Options.Locale) - if err != nil { - flog.Error(err) - } - - resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{ - Format: "html", - }) - - text = resp[0].Text - flog.Debugf("post-translate:"+text) - - if resp[0].Source == channelLang { - msg.IsTranslation = false - return - } - - // Add space buffer after html before stripping, or characters after tags get merged into urls or usernames - text = regexp.MustCompile(`.+?`).ReplaceAllString(text, " $0 ") - - allowableTags := []string{ - "p", - "i", - "b", - "em", - "strong", - "br", - "del", - "blockquote", - "pre", - "code", - "li", - "ul", - "ol", - } - - stripped, _ := htmltags.Strip(text, allowableTags, false) - text = stripped.ToString() - flog.Debugf("post-strip:"+text) - html2md.AddRule("del", &html2md.Rule{ - Patterns: []string{"del"}, - Replacement: func(innerHTML string, attrs []string) string { - if len(attrs) > 1 { - // Extra spaces so that Slack will process, even though Chinese characters don't get spaces - return html2md.WrapInlineTag(attrs[1], " ~", "~ ") - } - return "" - }, - }) - // Custom override for slackdown - html2md.AddRule("b", &html2md.Rule{ - Patterns: []string{"b", "strong"}, - Replacement: func(innerHTML string, attrs []string) string { - if len(attrs) > 1 { - // trailing whitespace due to Mandarin issues - return html2md.WrapInlineTag(attrs[1], "*", "* ") - } - return "" - }, - }) - // Custom override of default code rule: - // This converts multiline code tags to codeblocks - html2md.AddRule("code", &html2md.Rule{ - Patterns: []string{"code", "tt", "pre"}, - Replacement: func(innerHTML string, attrs []string) string { - contents := attrs[1] - if strings.Contains(contents, "\n") { - r := regexp.MustCompile(`/^\t+`) - innerHTML = r.ReplaceAllString(contents, " ") - return "\n\n```\n" + innerHTML + "```\n" - 
} - if len(attrs) > 1 { - return "`" + attrs[1] + "`" - } - return "" - }, - }) - text = html2md.Convert(text) - - // colons: revert temp token - // See: previous comment on colons - text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2") - - flog.Debugf("post-MDconvert:"+text) - text = html.UnescapeString(text) - flog.Debugf("post-unescaped:"+text) - - if dest.Protocol == "slack" { - // Attribution will be in attachment for Slack - } else { - text = text + gw.Router.General.TranslationAttribution - } - - msg.Text = text -} - func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrMsgID { var brMsgIDs []*BrMsgID @@ -512,8 +267,7 @@ func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrM msg.Username = gw.modifyUsername(origmsg, dest) msg.ID = "" - // Translation - if (gw.Router.GTClient != nil) { + if gw.translationEnabled() { gw.handleTranslation(&msg, dest, channel) } diff --git a/gateway/translation.go b/gateway/translation.go new file mode 100644 index 00000000..96f8d12c --- /dev/null +++ b/gateway/translation.go @@ -0,0 +1,199 @@ +package gateway + +import ( + "context" + "fmt" + "html" + "regexp" + "sort" + "strings" + + "github.com/42wim/matterbridge/bridge" + "github.com/42wim/matterbridge/bridge/config" + + "github.com/russross/blackfriday" + "github.com/urakozz/go-emoji" + "github.com/lunny/html2md" + "github.com/darkoatanasovski/htmltags" + "golang.org/x/text/language" + "cloud.google.com/go/translate" +) + +func (gw *Gateway) translationEnabled() bool { + return gw.Router.GTClient != nil +} + +func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) { + // Skip if channel locale not set + if channel.Options.Locale == "" { + return + } + + // Don't try to translate empty messages + if msg.OrigMsg.Text == "" { + return + } + + msg.IsTranslation = true + ctx := context.Background() + + client := gw.Router.GTClient + defer client.Close() + + text 
:= msg.Text + var results [][]string + + // colons: add temp token + // This is an ugly hack to work around what seems to be a bug in the Google Translate API. + // See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199 + text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "ː$2") + + // url + url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`) + text = url_re.ReplaceAllString(text, "$0") + + flog.Debugf("pre-parseMD:"+text) + + // Get rid of these wierdo bullets that Slack uses, which confuse translation + text = strings.Replace(text, "•", "-", -1) + + // Make sure we use closed
tags + const htmlFlags = blackfriday.HTML_USE_XHTML + renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)} + const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK | + blackfriday.EXTENSION_HARD_LINE_BREAK | + blackfriday.EXTENSION_STRIKETHROUGH | + blackfriday.EXTENSION_FENCED_CODE | + blackfriday.EXTENSION_HARD_LINE_BREAK + output := blackfriday.Markdown([]byte(text), renderer, extensions) + text = string(output) + flog.Debugf("post-parseMD:"+string(output)) + + // @usernames + results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) + // Sort so that longest channel names are acted on first + sort.SliceStable(results, func(i, j int) bool { + return len(results[i][1]) > len(results[j][1]) + }) + for _, r := range results { + text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") + } + + flog.Debugf("post cleanup:usernames:"+text) + + // #channels + results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) + // Sort so that longest channel names are acted on first + sort.SliceStable(results, func(i, j int) bool { + return len(results[i][1]) > len(results[j][1]) + }) + for _, r := range results { + // If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the in front + text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") + } + + flog.Debugf("post cleanup:channels:"+text) + + // :emoji: + text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "$0") + + // :emoji: codepoints, ie. 
💎 + text = emoji.NewEmojiParser().ReplaceAllString(text, "$0") + + flog.Debugf("post cleanup:emojis:"+text) + + channelLang, err := language.Parse(channel.Options.Locale) + if err != nil { + flog.Error(err) + } + + resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{ + Format: "html", + }) + + text = resp[0].Text + flog.Debugf("post-translate:"+text) + + if resp[0].Source == channelLang { + msg.IsTranslation = false + return + } + + // Add space buffer after html before stripping, or characters after tags get merged into urls or usernames + text = regexp.MustCompile(`.+?`).ReplaceAllString(text, " $0 ") + + allowableTags := []string{ + "p", + "i", + "b", + "em", + "strong", + "br", + "del", + "blockquote", + "pre", + "code", + "li", + "ul", + "ol", + } + + stripped, _ := htmltags.Strip(text, allowableTags, false) + text = stripped.ToString() + flog.Debugf("post-strip:"+text) + html2md.AddRule("del", &html2md.Rule{ + Patterns: []string{"del"}, + Replacement: func(innerHTML string, attrs []string) string { + if len(attrs) > 1 { + // Extra spaces so that Slack will process, even though Chinese characters don't get spaces + return html2md.WrapInlineTag(attrs[1], " ~", "~ ") + } + return "" + }, + }) + // Custom override for slackdown + html2md.AddRule("b", &html2md.Rule{ + Patterns: []string{"b", "strong"}, + Replacement: func(innerHTML string, attrs []string) string { + if len(attrs) > 1 { + // trailing whitespace due to Mandarin issues + return html2md.WrapInlineTag(attrs[1], "*", "* ") + } + return "" + }, + }) + // Custom override of default code rule: + // This converts multiline code tags to codeblocks + html2md.AddRule("code", &html2md.Rule{ + Patterns: []string{"code", "tt", "pre"}, + Replacement: func(innerHTML string, attrs []string) string { + contents := attrs[1] + if strings.Contains(contents, "\n") { + r := regexp.MustCompile(`/^\t+`) + innerHTML = r.ReplaceAllString(contents, " ") + return "\n\n```\n" + innerHTML + "```\n" + 
} + if len(attrs) > 1 { + return "`" + attrs[1] + "`" + } + return "" + }, + }) + text = html2md.Convert(text) + + // colons: revert temp token + // See: previous comment on colons + text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2") + + flog.Debugf("post-MDconvert:"+text) + text = html.UnescapeString(text) + flog.Debugf("post-unescaped:"+text) + + if dest.Protocol == "slack" { + // Attribution will be in attachment for Slack + } else { + text = text + gw.Router.General.TranslationAttribution + } + + msg.Text = text +}