Extracted translation code in separate files.

2018-10-24 01:10:10 +08:00
parent fc8fe48b49
commit ccadda3ec2
3 changed files with 269 additions and 247 deletions
@@ -0,0 +1,69 @@
+package gateway
+
+import (
+	"bytes"
+	"strings"
+
+	"github.com/russross/blackfriday"
+)
+
+type renderer struct {
+	*blackfriday.Html
+}
+
+func doubleSpace(out *bytes.Buffer) {
+	if out.Len() > 0 {
+		out.WriteByte('\n')
+	}
+}
+
+func escapeSingleChar(char byte) (string, bool) {
+	if char == '"' {
+		return "&quot;", true
+	}
+	if char == '&' {
+		return "&amp;", true
+	}
+	if char == '<' {
+		return "&lt;", true
+	}
+	if char == '>' {
+		return "&gt;", true
+	}
+	return "", false
+}
+
+func attrEscape(out *bytes.Buffer, src []byte) {
+	org := 0
+	for i, ch := range src {
+		if entity, ok := escapeSingleChar(ch); ok {
+			if i > org {
+				// copy all the normal characters since the last escape
+				out.Write(src[org:i])
+			}
+			org = i + 1
+			out.WriteString(entity)
+		}
+	}
+	if org < len(src) {
+		out.Write(src[org:])
+	}
+}
+
+// Using <code> rather than <pre> keeps Google Translate from trying to process it.
+// BUT it collapses code into one line for some reason, and <pre> preserves newlines.
+// #TODO Investigating the <pre><code> combo might work.
+func (*renderer) BlockCode(out *bytes.Buffer, text []byte, info string) {
+	doubleSpace(out)
+
+	endOfLang := strings.IndexAny(info, "\t ")
+	if endOfLang < 0 {
+		endOfLang = len(info)
+	}
+	lang := info[:endOfLang]
+	if len(lang) == 0 || lang == "." {
+		out.WriteString("<pre translate='no'>")
+	}
+	attrEscape(out, text)
+	out.WriteString("</pre>\n")
+}
@@ -6,12 +6,6 @@ import (
 	"io/ioutil"
 	"net/http"
 	"os"
-	"context"
-	"html"
-	"sort"
-	"github.com/darkoatanasovski/htmltags"
-	"github.com/urakozz/go-emoji"
-	"github.com/lunny/html2md"

 	"github.com/42wim/matterbridge/bridge"
 	"github.com/42wim/matterbridge/bridge/api"
@@ -29,11 +23,8 @@ import (
 	bxmpp "github.com/42wim/matterbridge/bridge/xmpp"
 	bzulip "github.com/42wim/matterbridge/bridge/zulip"
 	"github.com/hashicorp/golang-lru"
-	"github.com/russross/blackfriday"
 	log "github.com/sirupsen/logrus"
 	//	"github.com/davecgh/go-spew/spew"
-	"cloud.google.com/go/translate"
-	"golang.org/x/text/language"
 	"crypto/sha1"
 	"path/filepath"
 	"regexp"
@@ -215,242 +206,6 @@ func (gw *Gateway) getDestChannel(msg *config.Message, dest bridge.Bridge) []con
 	return channels
 }

-type renderer struct {
-	*blackfriday.Html
-}
-
-func doubleSpace(out *bytes.Buffer) {
-	if out.Len() > 0 {
-		out.WriteByte('\n')
-	}
-}
-
-func escapeSingleChar(char byte) (string, bool) {
-	if char == '"' {
-		return "&quot;", true
-	}
-	if char == '&' {
-		return "&amp;", true
-	}
-	if char == '<' {
-		return "&lt;", true
-	}
-	if char == '>' {
-		return "&gt;", true
-	}
-	return "", false
-}
-
-func attrEscape(out *bytes.Buffer, src []byte) {
-	org := 0
-	for i, ch := range src {
-		if entity, ok := escapeSingleChar(ch); ok {
-			if i > org {
-				// copy all the normal characters since the last escape
-				out.Write(src[org:i])
-			}
-			org = i + 1
-			out.WriteString(entity)
-		}
-	}
-	if org < len(src) {
-		out.Write(src[org:])
-	}
-}
-
-// Using <code> rather than <pre> keeps Google Translate from trying to process it.
-// BUT it collapses code into one line for some reason, and <pre> preserves newlines.
-// #TODO Investigating the <pre><code> combo might work.
-func (*renderer) BlockCode(out *bytes.Buffer, text []byte, info string) {
-	doubleSpace(out)
-
-	endOfLang := strings.IndexAny(info, "\t ")
-	if endOfLang < 0 {
-		endOfLang = len(info)
-	}
-	lang := info[:endOfLang]
-	if len(lang) == 0 || lang == "." {
-		out.WriteString("<pre translate='no'>")
-	}
-	attrEscape(out, text)
-	out.WriteString("</pre>\n")
-}
-
-func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) {
-	// Skip if channel locale not set
-	if channel.Options.Locale == "" {
-		return
-	}
-
-	// Don't try to translate empty messages
-	if msg.OrigMsg.Text == "" {
-		return
-	}
-
-	msg.IsTranslation = true
-	ctx := context.Background()
-
-	client := gw.Router.GTClient
-	defer client.Close()
-
-	text := msg.Text
-	var results [][]string
-
-	// colons: add temp token
-	// This is an ugly hack to work around what seems to be a bug in the Google Translate API.
-	// See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199
-	text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "<span translate='no'>ː$2</span>")
-
-	// url
-	url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`)
-	text = url_re.ReplaceAllString(text, "<span translate='no'>$0</span>")
-
-	flog.Debugf("pre-parseMD:"+text)
-
-	// Get rid of these wierdo bullets that Slack uses, which confuse translation
-	text = strings.Replace(text, "•", "-", -1)
-
-	// Make sure we use closed <br/> tags
-	const htmlFlags = blackfriday.HTML_USE_XHTML
-	renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)}
-	const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK |
-		blackfriday.EXTENSION_HARD_LINE_BREAK |
-		blackfriday.EXTENSION_STRIKETHROUGH |
-		blackfriday.EXTENSION_FENCED_CODE |
-		blackfriday.EXTENSION_HARD_LINE_BREAK
-	output := blackfriday.Markdown([]byte(text), renderer, extensions)
-	text = string(output)
-	flog.Debugf("post-parseMD:"+string(output))
-
-	// @usernames
-	results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
-	// Sort so that longest channel names are acted on first
-	sort.SliceStable(results, func(i, j int) bool {
-		return len(results[i][1]) > len(results[j][1])
-	})
-	for _, r := range results {
-		text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
-	}
-
-	flog.Debugf("post cleanup:usernames:"+text)
-
-	// #channels
-	results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
-	// Sort so that longest channel names are acted on first
-	sort.SliceStable(results, func(i, j int) bool {
-		return len(results[i][1]) > len(results[j][1])
-	})
-	for _, r := range results {
-		// If a channel that's a substring of another channel (processed earlier)  matches, it will abort due to the <tag> in front
-		text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
-	}
-
-	flog.Debugf("post cleanup:channels:"+text)
-
-	// :emoji:
-	text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "<span translate='no'>$0</span>")
-
-	// :emoji: codepoints, ie. 💎
-	text = emoji.NewEmojiParser().ReplaceAllString(text, "<span translate='no'>$0</span>")
-
-	flog.Debugf("post cleanup:emojis:"+text)
-
-	channelLang, err := language.Parse(channel.Options.Locale)
-	if err != nil {
-		flog.Error(err)
-	}
-
-	resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{
-		Format: "html",
-	})
-
-	text = resp[0].Text
-	flog.Debugf("post-translate:"+text)
-
-	if resp[0].Source == channelLang {
-		msg.IsTranslation = false
-		return
-	}
-
-	// Add space buffer after html <span> before stripping, or characters after tags get merged into urls or usernames
-	text = regexp.MustCompile(`<span translate='no'>.+?</span>`).ReplaceAllString(text, " $0 ")
-
-	allowableTags := []string{
-		"p",
-		"i",
-		"b",
-		"em",
-		"strong",
-		"br",
-		"del",
-		"blockquote",
-		"pre",
-		"code",
-		"li",
-		"ul",
-		"ol",
-	}
-
-	stripped, _ := htmltags.Strip(text, allowableTags, false)
-	text = stripped.ToString()
-	flog.Debugf("post-strip:"+text)
-	html2md.AddRule("del", &html2md.Rule{
-		Patterns: []string{"del"},
-		Replacement: func(innerHTML string, attrs []string) string {
-			if len(attrs) > 1 {
-				// Extra spaces so that Slack will process, even though Chinese characters don't get spaces
-				return html2md.WrapInlineTag(attrs[1], " ~", "~ ")
-			}
-			return ""
-		},
-	})
-	// Custom override for slackdown
-	html2md.AddRule("b", &html2md.Rule{
-		Patterns: []string{"b", "strong"},
-		Replacement: func(innerHTML string, attrs []string) string {
-			if len(attrs) > 1 {
-				// trailing whitespace due to Mandarin issues
-				return html2md.WrapInlineTag(attrs[1], "*", "* ")
-			}
-			return ""
-		},
-	})
-	// Custom override of default code rule:
-	// This converts multiline code tags to codeblocks
-	html2md.AddRule("code", &html2md.Rule{
-		Patterns: []string{"code", "tt", "pre"},
-		Replacement: func(innerHTML string, attrs []string) string {
-			contents := attrs[1]
-			if strings.Contains(contents, "\n") {
-				r := regexp.MustCompile(`/^\t+`)
-				innerHTML = r.ReplaceAllString(contents, "  ")
-				return "\n\n```\n" + innerHTML + "```\n"
-			}
-			if len(attrs) > 1 {
-				return "`" + attrs[1] + "`"
-			}
-			return ""
-		},
-	})
-	text = html2md.Convert(text)
-
-	// colons: revert temp token
-	// See: previous comment on colons
-	text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2")
-
-	flog.Debugf("post-MDconvert:"+text)
-	text = html.UnescapeString(text)
-	flog.Debugf("post-unescaped:"+text)
-
-	if dest.Protocol == "slack" {
-		// Attribution will be in attachment for Slack
-	} else {
-		text = text + gw.Router.General.TranslationAttribution
-	}
-
-	msg.Text = text
-}
-
 func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrMsgID {
 	var brMsgIDs []*BrMsgID

@@ -512,8 +267,7 @@ func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrM
 		msg.Username = gw.modifyUsername(origmsg, dest)
 		msg.ID = ""

-		// Translation
-		if (gw.Router.GTClient != nil) {
+		if gw.translationEnabled() {
 			gw.handleTranslation(&msg, dest, channel)
 		}

@@ -0,0 +1,199 @@
+package gateway
+
+import (
+	"context"
+	"fmt"
+	"html"
+	"regexp"
+	"sort"
+	"strings"
+
+	"github.com/42wim/matterbridge/bridge"
+	"github.com/42wim/matterbridge/bridge/config"
+
+	"github.com/russross/blackfriday"
+	"github.com/urakozz/go-emoji"
+	"github.com/lunny/html2md"
+	"github.com/darkoatanasovski/htmltags"
+	"golang.org/x/text/language"
+	"cloud.google.com/go/translate"
+)
+
+func (gw *Gateway) translationEnabled() bool {
+	return gw.Router.GTClient != nil
+}
+
+func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) {
+	// Skip if channel locale not set
+	if channel.Options.Locale == "" {
+		return
+	}
+
+	// Don't try to translate empty messages
+	if msg.OrigMsg.Text == "" {
+		return
+	}
+
+	msg.IsTranslation = true
+	ctx := context.Background()
+
+	client := gw.Router.GTClient
+	defer client.Close()
+
+	text := msg.Text
+	var results [][]string
+
+	// colons: add temp token
+	// This is an ugly hack to work around what seems to be a bug in the Google Translate API.
+	// See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199
+	text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "<span translate='no'>ː$2</span>")
+
+	// url
+	url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`)
+	text = url_re.ReplaceAllString(text, "<span translate='no'>$0</span>")
+
+	flog.Debugf("pre-parseMD:"+text)
+
+	// Get rid of these wierdo bullets that Slack uses, which confuse translation
+	text = strings.Replace(text, "•", "-", -1)
+
+	// Make sure we use closed <br/> tags
+	const htmlFlags = blackfriday.HTML_USE_XHTML
+	renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)}
+	const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK |
+		blackfriday.EXTENSION_HARD_LINE_BREAK |
+		blackfriday.EXTENSION_STRIKETHROUGH |
+		blackfriday.EXTENSION_FENCED_CODE |
+		blackfriday.EXTENSION_HARD_LINE_BREAK
+	output := blackfriday.Markdown([]byte(text), renderer, extensions)
+	text = string(output)
+	flog.Debugf("post-parseMD:"+string(output))
+
+	// @usernames
+	results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
+	// Sort so that longest channel names are acted on first
+	sort.SliceStable(results, func(i, j int) bool {
+		return len(results[i][1]) > len(results[j][1])
+	})
+	for _, r := range results {
+		text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
+	}
+
+	flog.Debugf("post cleanup:usernames:"+text)
+
+	// #channels
+	results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
+	// Sort so that longest channel names are acted on first
+	sort.SliceStable(results, func(i, j int) bool {
+		return len(results[i][1]) > len(results[j][1])
+	})
+	for _, r := range results {
+		// If a channel that's a substring of another channel (processed earlier)  matches, it will abort due to the <tag> in front
+		text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
+	}
+
+	flog.Debugf("post cleanup:channels:"+text)
+
+	// :emoji:
+	text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "<span translate='no'>$0</span>")
+
+	// :emoji: codepoints, ie. 💎
+	text = emoji.NewEmojiParser().ReplaceAllString(text, "<span translate='no'>$0</span>")
+
+	flog.Debugf("post cleanup:emojis:"+text)
+
+	channelLang, err := language.Parse(channel.Options.Locale)
+	if err != nil {
+		flog.Error(err)
+	}
+
+	resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{
+		Format: "html",
+	})
+
+	text = resp[0].Text
+	flog.Debugf("post-translate:"+text)
+
+	if resp[0].Source == channelLang {
+		msg.IsTranslation = false
+		return
+	}
+
+	// Add space buffer after html <span> before stripping, or characters after tags get merged into urls or usernames
+	text = regexp.MustCompile(`<span translate='no'>.+?</span>`).ReplaceAllString(text, " $0 ")
+
+	allowableTags := []string{
+		"p",
+		"i",
+		"b",
+		"em",
+		"strong",
+		"br",
+		"del",
+		"blockquote",
+		"pre",
+		"code",
+		"li",
+		"ul",
+		"ol",
+	}
+
+	stripped, _ := htmltags.Strip(text, allowableTags, false)
+	text = stripped.ToString()
+	flog.Debugf("post-strip:"+text)
+	html2md.AddRule("del", &html2md.Rule{
+		Patterns: []string{"del"},
+		Replacement: func(innerHTML string, attrs []string) string {
+			if len(attrs) > 1 {
+				// Extra spaces so that Slack will process, even though Chinese characters don't get spaces
+				return html2md.WrapInlineTag(attrs[1], " ~", "~ ")
+			}
+			return ""
+		},
+	})
+	// Custom override for slackdown
+	html2md.AddRule("b", &html2md.Rule{
+		Patterns: []string{"b", "strong"},
+		Replacement: func(innerHTML string, attrs []string) string {
+			if len(attrs) > 1 {
+				// trailing whitespace due to Mandarin issues
+				return html2md.WrapInlineTag(attrs[1], "*", "* ")
+			}
+			return ""
+		},
+	})
+	// Custom override of default code rule:
+	// This converts multiline code tags to codeblocks
+	html2md.AddRule("code", &html2md.Rule{
+		Patterns: []string{"code", "tt", "pre"},
+		Replacement: func(innerHTML string, attrs []string) string {
+			contents := attrs[1]
+			if strings.Contains(contents, "\n") {
+				r := regexp.MustCompile(`/^\t+`)
+				innerHTML = r.ReplaceAllString(contents, "  ")
+				return "\n\n```\n" + innerHTML + "```\n"
+			}
+			if len(attrs) > 1 {
+				return "`" + attrs[1] + "`"
+			}
+			return ""
+		},
+	})
+	text = html2md.Convert(text)
+
+	// colons: revert temp token
+	// See: previous comment on colons
+	text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2")
+
+	flog.Debugf("post-MDconvert:"+text)
+	text = html.UnescapeString(text)
+	flog.Debugf("post-unescaped:"+text)
+
+	if dest.Protocol == "slack" {
+		// Attribution will be in attachment for Slack
+	} else {
+		text = text + gw.Router.General.TranslationAttribution
+	}
+
+	msg.Text = text
+}