// discordLinkRegex matches the spans of text that Discord turns into links on
// its own. Text inside such a span must be passed through untouched, because
// backslash escapes inside a URL would change the URL.
//
// Discord links start with http:// or https://, contain at least two characters afterwards,
// don't contain < or whitespace anywhere, and don't end with "'),.:;]
//
// "Whitespace" has to be spelled out explicitly for Go's regexp engine:
//   - \s is ASCII-only in Go (tab, newline, form feed, carriage return, space),
//   - \x0b adds the vertical tab, which Go's \s does not include,
//   - \p{Z} adds every Unicode separator (space, line and paragraph separators).
//
// Zero-width whitespace is mostly in the Format category and is allowed, except \uFEFF isn't for some reason
//
// Both character classes exclude '<' and all whitespace. A negated class in Go
// matches newlines, so leaving them out would let a "link" run across line
// breaks, and allowing '<' as the last character would leave the '<' of
// something like "https://example.com<@123>" unescaped.
var discordLinkRegex = regexp.MustCompile(`https?://[^<\s\x0b\p{Z}\x{feff}]+[^<"'),.:;\]\s\x0b\p{Z}\x{feff}]`)

// discordMarkdownEscaper backslash-escapes every character that Discord treats
// as markdown syntax (or, for '<', as the start of a mention/emoji/timestamp).
// The backslash itself is listed first so that literal backslashes are escaped too.
var discordMarkdownEscaper = strings.NewReplacer(
	`\`, `\\`,
	`_`, `\_`,
	`*`, `\*`,
	`~`, `\~`,
	"`", "\\`",
	`|`, `\|`,
	`<`, `\<`,
)

// escapeDiscordMarkdown returns s with Discord markdown syntax escaped
// everywhere except inside the spans matched by discordLinkRegex, which are
// copied verbatim.
func escapeDiscordMarkdown(s string) string {
	links := discordLinkRegex.FindAllStringIndex(s, -1)
	if len(links) == 0 {
		// No links at all: the whole string is plain text.
		return discordMarkdownEscaper.Replace(s)
	}

	var builder strings.Builder
	// The output is at least as long as the input; escapes only add bytes.
	builder.Grow(len(s))

	offset := 0
	for _, link := range links {
		start, end := link[0], link[1]
		// Escape the plain text between the previous link and this one...
		builder.WriteString(discordMarkdownEscaper.Replace(s[offset:start]))
		// ...and keep the link itself byte-for-byte.
		builder.WriteString(s[start:end])
		offset = end
	}
	// Escape whatever follows the last link.
	builder.WriteString(discordMarkdownEscaper.Replace(s[offset:]))
	return builder.String()
}
escapeDiscordMarkdown(s) }, SpoilerConverter: func(text, reason string, ctx format.Context) string { if reason != "" { @@ -116,16 +150,6 @@ func init() { matrixHTMLParser.PillConverter = pillConverter } -var discordMarkdownEscaper = strings.NewReplacer( - `\`, `\\`, - `_`, `\_`, - `*`, `\*`, - `~`, `\~`, - "`", "\\`", - `|`, `\|`, - `<`, `\<`, -) - func (portal *Portal) parseMatrixHTML(user *User, content *event.MessageEventContent) string { if content.Format == event.FormatHTML && len(content.FormattedBody) > 0 { return matrixHTMLParser.Parse(content.FormattedBody, format.Context{ @@ -133,6 +157,6 @@ func (portal *Portal) parseMatrixHTML(user *User, content *event.MessageEventCon formatterContextPortalKey: portal, }) } else { - return discordMarkdownEscaper.Replace(content.Body) + return escapeDiscordMarkdown(content.Body) } } diff --git a/formatter_test.go b/formatter_test.go new file mode 100644 index 0000000..c05f95b --- /dev/null +++ b/formatter_test.go @@ -0,0 +1,57 @@ +// mautrix-discord - A Matrix-Discord puppeting bridge. +// Copyright (C) 2022 Tulir Asokan +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
+ +package main + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestEscapeDiscordMarkdown(t *testing.T) { + type escapeTest struct { + name string + input string + expected string + } + + tests := []escapeTest{ + {"Simple text", "Lorem ipsum dolor sit amet, consectetuer adipiscing elit.", "Lorem ipsum dolor sit amet, consectetuer adipiscing elit."}, + {"Backslash", `foo\bar`, `foo\\bar`}, + {"Underscore", `foo_bar`, `foo\_bar`}, + {"Asterisk", `foo*bar`, `foo\*bar`}, + {"Tilde", `foo~bar`, `foo\~bar`}, + {"Backtick", "foo`bar", "foo\\`bar"}, + {"Forward tick", `foo´bar`, `foo´bar`}, + {"Pipe", `foo|bar`, `foo\|bar`}, + {"Less than", `foobar`, `foo>bar`}, + {"Multiple things", `\_*~|`, `\\\_\*\~\|`}, + {"URL", `https://example.com/foo_bar`, `https://example.com/foo_bar`}, + {"Multiple URLs", `hello_world https://example.com/foo_bar *testing* https://a_b_c/*def*`, `hello\_world https://example.com/foo_bar \*testing\* https://a_b_c/*def*`}, + {"URL ends with no-break zero-width space", "https://example.com\ufefffoo_bar", "https://example.com\ufefffoo\\_bar"}, + {"URL ends with less than", `https://example.com