Extracted translation code in separate files.
This commit is contained in:
69
gateway/blackfriday.go
Normal file
69
gateway/blackfriday.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package gateway
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
|
||||
"github.com/russross/blackfriday"
|
||||
)
|
||||
|
||||
type renderer struct {
|
||||
*blackfriday.Html
|
||||
}
|
||||
|
||||
// doubleSpace appends a single newline to out, unless out is still empty, so
// that a block element never starts the document with a blank line.
func doubleSpace(out *bytes.Buffer) {
	if out.Len() == 0 {
		return
	}
	out.WriteByte('\n')
}
|
||||
|
||||
// escapeSingleChar returns the HTML entity for char and true when char is one
// of the four characters that must be escaped in HTML text and attribute
// values ('"', '&', '<', '>'). For every other byte it returns "" and false.
func escapeSingleChar(char byte) (string, bool) {
	switch char {
	case '"':
		return "&quot;", true
	case '&':
		return "&amp;", true
	case '<':
		return "&lt;", true
	case '>':
		return "&gt;", true
	}
	return "", false
}
|
||||
|
||||
func attrEscape(out *bytes.Buffer, src []byte) {
|
||||
org := 0
|
||||
for i, ch := range src {
|
||||
if entity, ok := escapeSingleChar(ch); ok {
|
||||
if i > org {
|
||||
// copy all the normal characters since the last escape
|
||||
out.Write(src[org:i])
|
||||
}
|
||||
org = i + 1
|
||||
out.WriteString(entity)
|
||||
}
|
||||
}
|
||||
if org < len(src) {
|
||||
out.Write(src[org:])
|
||||
}
|
||||
}
|
||||
|
||||
// Using <code> rather than <pre> keeps Google Translate from trying to process it.
|
||||
// BUT it collapses code into one line for some reason, and <pre> preserves newlines.
|
||||
// #TODO Investigating the <pre><code> combo might work.
|
||||
func (*renderer) BlockCode(out *bytes.Buffer, text []byte, info string) {
|
||||
doubleSpace(out)
|
||||
|
||||
endOfLang := strings.IndexAny(info, "\t ")
|
||||
if endOfLang < 0 {
|
||||
endOfLang = len(info)
|
||||
}
|
||||
lang := info[:endOfLang]
|
||||
if len(lang) == 0 || lang == "." {
|
||||
out.WriteString("<pre translate='no'>")
|
||||
}
|
||||
attrEscape(out, text)
|
||||
out.WriteString("</pre>\n")
|
||||
}
|
||||
@@ -6,12 +6,6 @@ import (
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"os"
|
||||
"context"
|
||||
"html"
|
||||
"sort"
|
||||
"github.com/darkoatanasovski/htmltags"
|
||||
"github.com/urakozz/go-emoji"
|
||||
"github.com/lunny/html2md"
|
||||
|
||||
"github.com/42wim/matterbridge/bridge"
|
||||
"github.com/42wim/matterbridge/bridge/api"
|
||||
@@ -29,11 +23,8 @@ import (
|
||||
bxmpp "github.com/42wim/matterbridge/bridge/xmpp"
|
||||
bzulip "github.com/42wim/matterbridge/bridge/zulip"
|
||||
"github.com/hashicorp/golang-lru"
|
||||
"github.com/russross/blackfriday"
|
||||
log "github.com/sirupsen/logrus"
|
||||
// "github.com/davecgh/go-spew/spew"
|
||||
"cloud.google.com/go/translate"
|
||||
"golang.org/x/text/language"
|
||||
"crypto/sha1"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
@@ -215,242 +206,6 @@ func (gw *Gateway) getDestChannel(msg *config.Message, dest bridge.Bridge) []con
|
||||
return channels
|
||||
}
|
||||
|
||||
type renderer struct {
|
||||
*blackfriday.Html
|
||||
}
|
||||
|
||||
// doubleSpace appends a single newline to out, unless out is still empty, so
// that a block element never starts the document with a blank line.
func doubleSpace(out *bytes.Buffer) {
	if out.Len() == 0 {
		return
	}
	out.WriteByte('\n')
}
|
||||
|
||||
// escapeSingleChar returns the HTML entity for char and true when char is one
// of the four characters that must be escaped in HTML text and attribute
// values ('"', '&', '<', '>'). For every other byte it returns "" and false.
func escapeSingleChar(char byte) (string, bool) {
	switch char {
	case '"':
		return "&quot;", true
	case '&':
		return "&amp;", true
	case '<':
		return "&lt;", true
	case '>':
		return "&gt;", true
	}
	return "", false
}
|
||||
|
||||
func attrEscape(out *bytes.Buffer, src []byte) {
|
||||
org := 0
|
||||
for i, ch := range src {
|
||||
if entity, ok := escapeSingleChar(ch); ok {
|
||||
if i > org {
|
||||
// copy all the normal characters since the last escape
|
||||
out.Write(src[org:i])
|
||||
}
|
||||
org = i + 1
|
||||
out.WriteString(entity)
|
||||
}
|
||||
}
|
||||
if org < len(src) {
|
||||
out.Write(src[org:])
|
||||
}
|
||||
}
|
||||
|
||||
// Using <code> rather than <pre> keeps Google Translate from trying to process it.
|
||||
// BUT it collapses code into one line for some reason, and <pre> preserves newlines.
|
||||
// #TODO Investigating the <pre><code> combo might work.
|
||||
func (*renderer) BlockCode(out *bytes.Buffer, text []byte, info string) {
|
||||
doubleSpace(out)
|
||||
|
||||
endOfLang := strings.IndexAny(info, "\t ")
|
||||
if endOfLang < 0 {
|
||||
endOfLang = len(info)
|
||||
}
|
||||
lang := info[:endOfLang]
|
||||
if len(lang) == 0 || lang == "." {
|
||||
out.WriteString("<pre translate='no'>")
|
||||
}
|
||||
attrEscape(out, text)
|
||||
out.WriteString("</pre>\n")
|
||||
}
|
||||
|
||||
func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) {
|
||||
// Skip if channel locale not set
|
||||
if channel.Options.Locale == "" {
|
||||
return
|
||||
}
|
||||
|
||||
// Don't try to translate empty messages
|
||||
if msg.OrigMsg.Text == "" {
|
||||
return
|
||||
}
|
||||
|
||||
msg.IsTranslation = true
|
||||
ctx := context.Background()
|
||||
|
||||
client := gw.Router.GTClient
|
||||
defer client.Close()
|
||||
|
||||
text := msg.Text
|
||||
var results [][]string
|
||||
|
||||
// colons: add temp token
|
||||
// This is an ugly hack to work around what seems to be a bug in the Google Translate API.
|
||||
// See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199
|
||||
text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "<span translate='no'>ː$2</span>")
|
||||
|
||||
// url
|
||||
url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`)
|
||||
text = url_re.ReplaceAllString(text, "<span translate='no'>$0</span>")
|
||||
|
||||
flog.Debugf("pre-parseMD:"+text)
|
||||
|
||||
// Get rid of these wierdo bullets that Slack uses, which confuse translation
|
||||
text = strings.Replace(text, "•", "-", -1)
|
||||
|
||||
// Make sure we use closed <br/> tags
|
||||
const htmlFlags = blackfriday.HTML_USE_XHTML
|
||||
renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)}
|
||||
const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK |
|
||||
blackfriday.EXTENSION_HARD_LINE_BREAK |
|
||||
blackfriday.EXTENSION_STRIKETHROUGH |
|
||||
blackfriday.EXTENSION_FENCED_CODE |
|
||||
blackfriday.EXTENSION_HARD_LINE_BREAK
|
||||
output := blackfriday.Markdown([]byte(text), renderer, extensions)
|
||||
text = string(output)
|
||||
flog.Debugf("post-parseMD:"+string(output))
|
||||
|
||||
// @usernames
|
||||
results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
|
||||
// Sort so that longest channel names are acted on first
|
||||
sort.SliceStable(results, func(i, j int) bool {
|
||||
return len(results[i][1]) > len(results[j][1])
|
||||
})
|
||||
for _, r := range results {
|
||||
text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
|
||||
}
|
||||
|
||||
flog.Debugf("post cleanup:usernames:"+text)
|
||||
|
||||
// #channels
|
||||
results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
|
||||
// Sort so that longest channel names are acted on first
|
||||
sort.SliceStable(results, func(i, j int) bool {
|
||||
return len(results[i][1]) > len(results[j][1])
|
||||
})
|
||||
for _, r := range results {
|
||||
// If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the <tag> in front
|
||||
text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
|
||||
}
|
||||
|
||||
flog.Debugf("post cleanup:channels:"+text)
|
||||
|
||||
// :emoji:
|
||||
text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "<span translate='no'>$0</span>")
|
||||
|
||||
// :emoji: codepoints, ie. 💎
|
||||
text = emoji.NewEmojiParser().ReplaceAllString(text, "<span translate='no'>$0</span>")
|
||||
|
||||
flog.Debugf("post cleanup:emojis:"+text)
|
||||
|
||||
channelLang, err := language.Parse(channel.Options.Locale)
|
||||
if err != nil {
|
||||
flog.Error(err)
|
||||
}
|
||||
|
||||
resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{
|
||||
Format: "html",
|
||||
})
|
||||
|
||||
text = resp[0].Text
|
||||
flog.Debugf("post-translate:"+text)
|
||||
|
||||
if resp[0].Source == channelLang {
|
||||
msg.IsTranslation = false
|
||||
return
|
||||
}
|
||||
|
||||
// Add space buffer after html <span> before stripping, or characters after tags get merged into urls or usernames
|
||||
text = regexp.MustCompile(`<span translate='no'>.+?</span>`).ReplaceAllString(text, " $0 ")
|
||||
|
||||
allowableTags := []string{
|
||||
"p",
|
||||
"i",
|
||||
"b",
|
||||
"em",
|
||||
"strong",
|
||||
"br",
|
||||
"del",
|
||||
"blockquote",
|
||||
"pre",
|
||||
"code",
|
||||
"li",
|
||||
"ul",
|
||||
"ol",
|
||||
}
|
||||
|
||||
stripped, _ := htmltags.Strip(text, allowableTags, false)
|
||||
text = stripped.ToString()
|
||||
flog.Debugf("post-strip:"+text)
|
||||
html2md.AddRule("del", &html2md.Rule{
|
||||
Patterns: []string{"del"},
|
||||
Replacement: func(innerHTML string, attrs []string) string {
|
||||
if len(attrs) > 1 {
|
||||
// Extra spaces so that Slack will process, even though Chinese characters don't get spaces
|
||||
return html2md.WrapInlineTag(attrs[1], " ~", "~ ")
|
||||
}
|
||||
return ""
|
||||
},
|
||||
})
|
||||
// Custom override for slackdown
|
||||
html2md.AddRule("b", &html2md.Rule{
|
||||
Patterns: []string{"b", "strong"},
|
||||
Replacement: func(innerHTML string, attrs []string) string {
|
||||
if len(attrs) > 1 {
|
||||
// trailing whitespace due to Mandarin issues
|
||||
return html2md.WrapInlineTag(attrs[1], "*", "* ")
|
||||
}
|
||||
return ""
|
||||
},
|
||||
})
|
||||
// Custom override of default code rule:
|
||||
// This converts multiline code tags to codeblocks
|
||||
html2md.AddRule("code", &html2md.Rule{
|
||||
Patterns: []string{"code", "tt", "pre"},
|
||||
Replacement: func(innerHTML string, attrs []string) string {
|
||||
contents := attrs[1]
|
||||
if strings.Contains(contents, "\n") {
|
||||
r := regexp.MustCompile(`/^\t+`)
|
||||
innerHTML = r.ReplaceAllString(contents, " ")
|
||||
return "\n\n```\n" + innerHTML + "```\n"
|
||||
}
|
||||
if len(attrs) > 1 {
|
||||
return "`" + attrs[1] + "`"
|
||||
}
|
||||
return ""
|
||||
},
|
||||
})
|
||||
text = html2md.Convert(text)
|
||||
|
||||
// colons: revert temp token
|
||||
// See: previous comment on colons
|
||||
text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2")
|
||||
|
||||
flog.Debugf("post-MDconvert:"+text)
|
||||
text = html.UnescapeString(text)
|
||||
flog.Debugf("post-unescaped:"+text)
|
||||
|
||||
if dest.Protocol == "slack" {
|
||||
// Attribution will be in attachment for Slack
|
||||
} else {
|
||||
text = text + gw.Router.General.TranslationAttribution
|
||||
}
|
||||
|
||||
msg.Text = text
|
||||
}
|
||||
|
||||
func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrMsgID {
|
||||
var brMsgIDs []*BrMsgID
|
||||
|
||||
@@ -512,8 +267,7 @@ func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrM
|
||||
msg.Username = gw.modifyUsername(origmsg, dest)
|
||||
msg.ID = ""
|
||||
|
||||
// Translation
|
||||
if (gw.Router.GTClient != nil) {
|
||||
if gw.translationEnabled() {
|
||||
gw.handleTranslation(&msg, dest, channel)
|
||||
}
|
||||
|
||||
|
||||
199
gateway/translation.go
Normal file
199
gateway/translation.go
Normal file
@@ -0,0 +1,199 @@
|
||||
package gateway
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"html"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/42wim/matterbridge/bridge"
|
||||
"github.com/42wim/matterbridge/bridge/config"
|
||||
|
||||
"github.com/russross/blackfriday"
|
||||
"github.com/urakozz/go-emoji"
|
||||
"github.com/lunny/html2md"
|
||||
"github.com/darkoatanasovski/htmltags"
|
||||
"golang.org/x/text/language"
|
||||
"cloud.google.com/go/translate"
|
||||
)
|
||||
|
||||
func (gw *Gateway) translationEnabled() bool {
|
||||
return gw.Router.GTClient != nil
|
||||
}
|
||||
|
||||
func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) {
|
||||
// Skip if channel locale not set
|
||||
if channel.Options.Locale == "" {
|
||||
return
|
||||
}
|
||||
|
||||
// Don't try to translate empty messages
|
||||
if msg.OrigMsg.Text == "" {
|
||||
return
|
||||
}
|
||||
|
||||
msg.IsTranslation = true
|
||||
ctx := context.Background()
|
||||
|
||||
client := gw.Router.GTClient
|
||||
defer client.Close()
|
||||
|
||||
text := msg.Text
|
||||
var results [][]string
|
||||
|
||||
// colons: add temp token
|
||||
// This is an ugly hack to work around what seems to be a bug in the Google Translate API.
|
||||
// See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199
|
||||
text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "<span translate='no'>ː$2</span>")
|
||||
|
||||
// url
|
||||
url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`)
|
||||
text = url_re.ReplaceAllString(text, "<span translate='no'>$0</span>")
|
||||
|
||||
flog.Debugf("pre-parseMD:"+text)
|
||||
|
||||
// Get rid of these wierdo bullets that Slack uses, which confuse translation
|
||||
text = strings.Replace(text, "•", "-", -1)
|
||||
|
||||
// Make sure we use closed <br/> tags
|
||||
const htmlFlags = blackfriday.HTML_USE_XHTML
|
||||
renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)}
|
||||
const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK |
|
||||
blackfriday.EXTENSION_HARD_LINE_BREAK |
|
||||
blackfriday.EXTENSION_STRIKETHROUGH |
|
||||
blackfriday.EXTENSION_FENCED_CODE |
|
||||
blackfriday.EXTENSION_HARD_LINE_BREAK
|
||||
output := blackfriday.Markdown([]byte(text), renderer, extensions)
|
||||
text = string(output)
|
||||
flog.Debugf("post-parseMD:"+string(output))
|
||||
|
||||
// @usernames
|
||||
results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
|
||||
// Sort so that longest channel names are acted on first
|
||||
sort.SliceStable(results, func(i, j int) bool {
|
||||
return len(results[i][1]) > len(results[j][1])
|
||||
})
|
||||
for _, r := range results {
|
||||
text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
|
||||
}
|
||||
|
||||
flog.Debugf("post cleanup:usernames:"+text)
|
||||
|
||||
// #channels
|
||||
results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1)
|
||||
// Sort so that longest channel names are acted on first
|
||||
sort.SliceStable(results, func(i, j int) bool {
|
||||
return len(results[i][1]) > len(results[j][1])
|
||||
})
|
||||
for _, r := range results {
|
||||
// If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the <tag> in front
|
||||
text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1<span translate='no'>$2</span>")
|
||||
}
|
||||
|
||||
flog.Debugf("post cleanup:channels:"+text)
|
||||
|
||||
// :emoji:
|
||||
text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "<span translate='no'>$0</span>")
|
||||
|
||||
// :emoji: codepoints, ie. 💎
|
||||
text = emoji.NewEmojiParser().ReplaceAllString(text, "<span translate='no'>$0</span>")
|
||||
|
||||
flog.Debugf("post cleanup:emojis:"+text)
|
||||
|
||||
channelLang, err := language.Parse(channel.Options.Locale)
|
||||
if err != nil {
|
||||
flog.Error(err)
|
||||
}
|
||||
|
||||
resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{
|
||||
Format: "html",
|
||||
})
|
||||
|
||||
text = resp[0].Text
|
||||
flog.Debugf("post-translate:"+text)
|
||||
|
||||
if resp[0].Source == channelLang {
|
||||
msg.IsTranslation = false
|
||||
return
|
||||
}
|
||||
|
||||
// Add space buffer after html <span> before stripping, or characters after tags get merged into urls or usernames
|
||||
text = regexp.MustCompile(`<span translate='no'>.+?</span>`).ReplaceAllString(text, " $0 ")
|
||||
|
||||
allowableTags := []string{
|
||||
"p",
|
||||
"i",
|
||||
"b",
|
||||
"em",
|
||||
"strong",
|
||||
"br",
|
||||
"del",
|
||||
"blockquote",
|
||||
"pre",
|
||||
"code",
|
||||
"li",
|
||||
"ul",
|
||||
"ol",
|
||||
}
|
||||
|
||||
stripped, _ := htmltags.Strip(text, allowableTags, false)
|
||||
text = stripped.ToString()
|
||||
flog.Debugf("post-strip:"+text)
|
||||
html2md.AddRule("del", &html2md.Rule{
|
||||
Patterns: []string{"del"},
|
||||
Replacement: func(innerHTML string, attrs []string) string {
|
||||
if len(attrs) > 1 {
|
||||
// Extra spaces so that Slack will process, even though Chinese characters don't get spaces
|
||||
return html2md.WrapInlineTag(attrs[1], " ~", "~ ")
|
||||
}
|
||||
return ""
|
||||
},
|
||||
})
|
||||
// Custom override for slackdown
|
||||
html2md.AddRule("b", &html2md.Rule{
|
||||
Patterns: []string{"b", "strong"},
|
||||
Replacement: func(innerHTML string, attrs []string) string {
|
||||
if len(attrs) > 1 {
|
||||
// trailing whitespace due to Mandarin issues
|
||||
return html2md.WrapInlineTag(attrs[1], "*", "* ")
|
||||
}
|
||||
return ""
|
||||
},
|
||||
})
|
||||
// Custom override of default code rule:
|
||||
// This converts multiline code tags to codeblocks
|
||||
html2md.AddRule("code", &html2md.Rule{
|
||||
Patterns: []string{"code", "tt", "pre"},
|
||||
Replacement: func(innerHTML string, attrs []string) string {
|
||||
contents := attrs[1]
|
||||
if strings.Contains(contents, "\n") {
|
||||
r := regexp.MustCompile(`/^\t+`)
|
||||
innerHTML = r.ReplaceAllString(contents, " ")
|
||||
return "\n\n```\n" + innerHTML + "```\n"
|
||||
}
|
||||
if len(attrs) > 1 {
|
||||
return "`" + attrs[1] + "`"
|
||||
}
|
||||
return ""
|
||||
},
|
||||
})
|
||||
text = html2md.Convert(text)
|
||||
|
||||
// colons: revert temp token
|
||||
// See: previous comment on colons
|
||||
text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2")
|
||||
|
||||
flog.Debugf("post-MDconvert:"+text)
|
||||
text = html.UnescapeString(text)
|
||||
flog.Debugf("post-unescaped:"+text)
|
||||
|
||||
if dest.Protocol == "slack" {
|
||||
// Attribution will be in attachment for Slack
|
||||
} else {
|
||||
text = text + gw.Router.General.TranslationAttribution
|
||||
}
|
||||
|
||||
msg.Text = text
|
||||
}
|
||||
Reference in New Issue
Block a user