diff --git a/bridge/config/config.go b/bridge/config/config.go index 951b34ed..40c5ce32 100644 --- a/bridge/config/config.go +++ b/bridge/config/config.go @@ -37,7 +37,8 @@ type Message struct { Timestamp time.Time `json:"timestamp"` ID string `json:"id"` Extra map[string][]interface{} - TranslationSrcMsg *Message + OrigMsg *Message + IsTranslation bool } type FileInfo struct { diff --git a/bridge/slack/slack.go b/bridge/slack/slack.go index 637eb515..08d100ed 100644 --- a/bridge/slack/slack.go +++ b/bridge/slack/slack.go @@ -231,7 +231,7 @@ func (b *Bslack) Send(msg config.Message) (string, error) { // add file attachments np.Attachments = append(np.Attachments, b.createAttach(msg.Extra)...) // add translation attachment - if msg.TranslationSrcMsg != nil { + if msg.IsTranslation { // If source, then we're doing a translation np.Attachments = append(np.Attachments, b.createTranslationAttach(msg)) } @@ -291,14 +291,14 @@ func (b *Bslack) createAttach(extra map[string][]interface{}) []slack.Attachment } func (b *Bslack) createTranslationAttach(msg config.Message) slack.Attachment { - untranslatedTextPreview := msg.TranslationSrcMsg.Text + untranslatedTextPreview := msg.OrigMsg.Text previewCharCount := 100 - if len(msg.TranslationSrcMsg.Text) > previewCharCount { + if len(msg.OrigMsg.Text) > previewCharCount { untranslatedTextPreview = untranslatedTextPreview[:previewCharCount]+"..." } untranslatedTextPreview = strings.Replace(untranslatedTextPreview, "\n", " ", -1) - ch, err := b.getChannelByName(msg.TranslationSrcMsg.Channel) - time := strings.Split(msg.TranslationSrcMsg.ID, " ")[1] + ch, err := b.getChannelByName(msg.OrigMsg.Channel) + time := strings.Split(msg.OrigMsg.ID, " ")[1] params := slack.PermalinkParameters{ Channel: ch.ID, Ts: time, diff --git a/gateway/gateway.go b/gateway/gateway.go index 3559097e..ca9e50f6 100644 --- a/gateway/gateway.go +++ b/gateway/gateway.go @@ -276,6 +276,171 @@ func (*renderer) BlockCode(out *bytes.Buffer, text []byte, info string) { out.WriteString("\n") } +func (gw *Gateway) handleTranslation(msg *config.Message, dest *bridge.Bridge, channel config.ChannelInfo) { + msg.IsTranslation = true + ctx := context.Background() + + client := gw.Router.GTClient + defer client.Close() + + text := msg.Text + var results [][]string + + // colons: add temp token + // This is an ugly hack to work around what seems to be a bug in the Google Translate API. + // See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199 + text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "ː$2") + + // url + url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`) + text = url_re.ReplaceAllString(text, "$0") + + flog.Debugf("pre-parseMD:"+text) + + // Get rid of these wierdo bullets that Slack uses, which confuse translation + text = strings.Replace(text, "•", "-", -1) + + // Make sure we use closed
tags + const htmlFlags = blackfriday.HTML_USE_XHTML + renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)} + const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK | + blackfriday.EXTENSION_HARD_LINE_BREAK | + blackfriday.EXTENSION_STRIKETHROUGH | + blackfriday.EXTENSION_FENCED_CODE | + blackfriday.EXTENSION_HARD_LINE_BREAK + output := blackfriday.Markdown([]byte(text), renderer, extensions) + text = string(output) + flog.Debugf("post-parseMD:"+string(output)) + + // @usernames + results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) + // Sort so that longest channel names are acted on first + sort.SliceStable(results, func(i, j int) bool { + return len(results[i][1]) > len(results[j][1]) + }) + for _, r := range results { + text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") + } + + flog.Debugf("post cleanup:usernames:"+text) + + // #channels + results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) + // Sort so that longest channel names are acted on first + sort.SliceStable(results, func(i, j int) bool { + return len(results[i][1]) > len(results[j][1]) + }) + for _, r := range results { + // If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the in front + text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") + } + + flog.Debugf("post cleanup:channels:"+text) + + // :emoji: + text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "$0") + + // :emoji: codepoints, ie. 💎 + text = emoji.NewEmojiParser().ReplaceAllString(text, "$0") + + flog.Debugf("post cleanup:emojis:"+text) + + channelLang, err := language.Parse(channel.Options.Locale) + if err != nil { + flog.Error(err) + } + + resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{ + Format: "html", + }) + + text = resp[0].Text + flog.Debugf("post-translate:"+text) + + if resp[0].Source != channelLang { + // If the source language is the same as this channel, + // just use the original text and don't add attribution + + // Add space buffer after html before stripping, or characters after tags get merged into urls or usernames + text = regexp.MustCompile(`.+?`).ReplaceAllString(text, " $0 ") + + allowableTags := []string{ + "p", + "i", + "b", + "em", + "strong", + "br", + "del", + "blockquote", + "pre", + "code", + "li", + "ul", + "ol", + } + + stripped, _ := htmltags.Strip(text, allowableTags, false) + text = stripped.ToString() + flog.Debugf("post-strip:"+text) + html2md.AddRule("del", &html2md.Rule{ + Patterns: []string{"del"}, + Replacement: func(innerHTML string, attrs []string) string { + if len(attrs) > 1 { + // Extra spaces so that Slack will process, even though Chinese characters don't get spaces + return html2md.WrapInlineTag(attrs[1], " ~", "~ ") + } + return "" + }, + }) + // Custom override for slackdown + html2md.AddRule("b", &html2md.Rule{ + Patterns: []string{"b", "strong"}, + Replacement: func(innerHTML string, attrs []string) string { + if len(attrs) > 1 { + // trailing whitespace due to Mandarin issues + return html2md.WrapInlineTag(attrs[1], "*", "* ") + } + return "" + }, + }) + // Custom override of default code rule: + // This converts multiline code tags to codeblocks + html2md.AddRule("code", &html2md.Rule{ + Patterns: []string{"code", "tt", "pre"}, + Replacement: func(innerHTML string, attrs []string) string { + contents := attrs[1] + if strings.Contains(contents, "\n") { + r := regexp.MustCompile(`/^\t+`) + innerHTML = r.ReplaceAllString(contents, " ") + return "\n\n```\n" + innerHTML + "```\n" + } + if len(attrs) > 1 { + return "`" + attrs[1] + "`" + } + return "" + }, + }) + text := html2md.Convert(text) + + // colons: revert temp token + // See: previous comment on colons + text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2") + + flog.Debugf("post-MDconvert:"+text) + text = html.UnescapeString(text) + flog.Debugf("post-unescaped:"+text) + + if dest.Protocol == "slack" { + // Attribution will be in attachment for Slack + } else { + text = text + gw.Router.General.TranslationAttribution + } + + msg.Text = text + } +} + func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrMsgID { var brMsgIDs []*BrMsgID @@ -316,6 +481,10 @@ func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrM origmsg := msg channels := gw.getDestChannel(&msg, *dest) for _, channel := range channels { + msg.Text = origmsg.Text + msg.OrigMsg = &origmsg + msg.IsTranslation = false + // Only send the avatar download event to ourselves. if msg.Event == config.EVENT_AVATAR_DOWNLOAD { if channel.ID != getChannelID(origmsg) { @@ -332,176 +501,13 @@ func (gw *Gateway) handleMessage(msg config.Message, dest *bridge.Bridge) []*BrM msg.Avatar = gw.modifyAvatar(origmsg, dest) msg.Username = gw.modifyUsername(origmsg, dest) msg.ID = "" - // Don't keep artifacts in between processed translations - msg.Text = origmsg.Text - msg.TranslationSrcMsg = nil // Translation if (gw.Router.GTClient != nil) && (channel.Options.Locale != "") && (msg.Text != "") { - msg.TranslationSrcMsg = &origmsg - - ctx := context.Background() - - client := gw.Router.GTClient - defer client.Close() - - text := msg.Text - var results [][]string - - // colons: add temp token - // This is an ugly hack to work around what seems to be a bug in the Google Translate API. - // See: https://github.com/42wim/matterbridge/pull/512#issuecomment-428910199 - text = regexp.MustCompile(`(:)([ $])`).ReplaceAllString(text, "ː$2") - - // url - url_re := regexp.MustCompile(`(((http(s)?(\:\/\/))+(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\n|]*[^.,;:\?\!\@\^\$ -])`) - text = url_re.ReplaceAllString(text, "$0") - - flog.Debugf("pre-parseMD:"+text) - - // Get rid of these wierdo bullets that Slack uses, which confuse translation - text = strings.Replace(text, "•", "-", -1) - - // Make sure we use closed
tags - const htmlFlags = blackfriday.HTML_USE_XHTML - renderer := &renderer{Html: blackfriday.HtmlRenderer(htmlFlags, "", "").(*blackfriday.Html)} - const extensions = blackfriday.LINK_TYPE_NOT_AUTOLINK | - blackfriday.EXTENSION_HARD_LINE_BREAK | - blackfriday.EXTENSION_STRIKETHROUGH | - blackfriday.EXTENSION_FENCED_CODE | - blackfriday.EXTENSION_HARD_LINE_BREAK - output := blackfriday.Markdown([]byte(text), renderer, extensions) - text = string(output) - flog.Debugf("post-parseMD:"+string(output)) - - // @usernames - results = regexp.MustCompile(`(@[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) - // Sort so that longest channel names are acted on first - sort.SliceStable(results, func(i, j int) bool { - return len(results[i][1]) > len(results[j][1]) - }) - for _, r := range results { - text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") - } - - flog.Debugf("post cleanup:usernames:"+text) - - // #channels - results = regexp.MustCompile(`(#[a-zA-Z0-9-]+)`).FindAllStringSubmatch(text, -1) - // Sort so that longest channel names are acted on first - sort.SliceStable(results, func(i, j int) bool { - return len(results[i][1]) > len(results[j][1]) - }) - for _, r := range results { - // If a channel that's a substring of another channel (processed earlier) matches, it will abort due to the in front - text = regexp.MustCompile(fmt.Sprintf(`([^>])(%s)`, r[1])).ReplaceAllString(text, "$1$2") - } - - flog.Debugf("post cleanup:channels:"+text) - - // :emoji: - text = regexp.MustCompile(`:[a-z0-9-_]+?:`).ReplaceAllString(text, "$0") - - // :emoji: codepoints, ie. 💎 - text = emoji.NewEmojiParser().ReplaceAllString(text, "$0") - - flog.Debugf("post cleanup:emojis:"+text) - - channelLang, err := language.Parse(channel.Options.Locale) - if err != nil { - flog.Error(err) - } - - resp, _ := client.Translate(ctx, []string{text}, channelLang, &translate.Options{ - Format: "html", - }) - - text = resp[0].Text - flog.Debugf("post-translate:"+text) - - if resp[0].Source != channelLang { - // If the source language is the same as this channel, - // just use the original text and don't add attribution - - // Add space buffer after html before stripping, or characters after tags get merged into urls or usernames - text = regexp.MustCompile(`.+?`).ReplaceAllString(text, " $0 ") - - allowableTags := []string{ - "p", - "i", - "b", - "em", - "strong", - "br", - "del", - "blockquote", - "pre", - "code", - "li", - "ul", - "ol", - } - - stripped, _ := htmltags.Strip(text, allowableTags, false) - text = stripped.ToString() - flog.Debugf("post-strip:"+text) - html2md.AddRule("del", &html2md.Rule{ - Patterns: []string{"del"}, - Replacement: func(innerHTML string, attrs []string) string { - if len(attrs) > 1 { - // Extra spaces so that Slack will process, even though Chinese characters don't get spaces - return html2md.WrapInlineTag(attrs[1], " ~", "~ ") - } - return "" - }, - }) - // Custom override for slackdown - html2md.AddRule("b", &html2md.Rule{ - Patterns: []string{"b", "strong"}, - Replacement: func(innerHTML string, attrs []string) string { - if len(attrs) > 1 { - // trailing whitespace due to Mandarin issues - return html2md.WrapInlineTag(attrs[1], "*", "* ") - } - return "" - }, - }) - // Custom override of default code rule: - // This converts multiline code tags to codeblocks - html2md.AddRule("code", &html2md.Rule{ - Patterns: []string{"code", "tt", "pre"}, - Replacement: func(innerHTML string, attrs []string) string { - contents := attrs[1] - if strings.Contains(contents, "\n") { - r := regexp.MustCompile(`/^\t+`) - innerHTML = r.ReplaceAllString(contents, " ") - return "\n\n```\n" + innerHTML + "```\n" - } - if len(attrs) > 1 { - return "`" + attrs[1] + "`" - } - return "" - }, - }) - text := html2md.Convert(text) - - // colons: revert temp token - // See: previous comment on colons - text = regexp.MustCompile(`(ː)([ $])`).ReplaceAllString(text, ":$2") - - flog.Debugf("post-MDconvert:"+text) - text = html.UnescapeString(text) - flog.Debugf("post-unescaped:"+text) - - if dest.Protocol == "slack" { - // Attribution will be in attachment for Slack - } else { - text = text + gw.Router.General.TranslationAttribution - } - - msg.Text = text - } + gw.handleTranslation(&msg, dest, channel) } + + if res, ok := gw.Messages.Get(origmsg.ID); ok { IDs := res.([]*BrMsgID) for _, id := range IDs {