summaryrefslogtreecommitdiff
path: root/internal/text
diff options
context:
space:
mode:
Diffstat (limited to 'internal/text')
-rw-r--r--internal/text/link.go115
-rw-r--r--internal/text/link_test.go155
-rw-r--r--internal/text/plain.go3
3 files changed, 273 insertions, 0 deletions
diff --git a/internal/text/link.go b/internal/text/link.go
new file mode 100644
index 000000000..440571a83
--- /dev/null
+++ b/internal/text/link.go
@@ -0,0 +1,115 @@
+/*
+ GoToSocial
+ Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package text
+
+import (
+ "fmt"
+ "net/url"
+
+ "mvdan.cc/xurls/v2"
+)
+
+// schemes is the regular expression fragment for the URL schemes we accept when looking for links.
+// Only http and https are matched; FindLinks and ReplaceLinks pass it to xurls.StrictMatchingScheme.
+var schemes = `(((http|https))://)`
+
+// FindLinks scans the given string for URLs that carry one of the accepted schemes (see schemes).
+// The input string is not modified. Matches are returned as parsed URLs in order of first
+// appearance, with duplicates (compared by their string form) removed.
+// A match that cannot be parsed as a URL is silently skipped.
+// If no URLs are found, an empty (non-nil) slice and a nil error are returned; a non-nil error
+// is only returned when the link-matching regex cannot be built.
+func FindLinks(in string) ([]*url.URL, error) {
+	linkRegex, err := xurls.StrictMatchingScheme(schemes)
+	if err != nil {
+		return nil, err
+	}
+
+	// parse and deduplicate in a single pass, keeping the first occurrence of each URL
+	unique := []*url.URL{}
+	for _, match := range linkRegex.FindAllString(in, -1) {
+		parsed, parseErr := url.Parse(match)
+		if parseErr != nil {
+			// not parseable as a URL: ignore this match and carry on
+			continue
+		}
+		if contains(unique, parsed) {
+			continue
+		}
+		unique = append(unique, parsed)
+	}
+
+	return unique, nil
+}
+
+// contains reports whether the slice already holds a URL whose string form
+// is equal to the string form of the given url.
+func contains(urls []*url.URL, url *url.URL) bool {
+	for i := range urls {
+		if existing := urls[i]; existing.String() == url.String() {
+			return true
+		}
+	}
+	return false
+}
+
+// ReplaceLinks replaces all detected links in a piece of text with their HTML (href) equivalents.
+// The href of each anchor is the link exactly as it was matched. The visible text is a shortened
+// form without the scheme: hostname (port not included), then path, then ?query, then #fragment.
+// Matches that cannot be parsed as a URL are left in the text unchanged.
+//
+// Note: because Go doesn't allow negative lookbehinds in regex, it's possible that an already-formatted
+// href will end up double-formatted, if the text you pass here contains one or more hrefs already.
+// To avoid this, you should sanitize any HTML out of text before you pass it into this function.
+//
+// ReplaceLinks panics if the link-matching regex cannot be built from schemes.
+func ReplaceLinks(in string) string {
+	rxStrict, err := xurls.StrictMatchingScheme(schemes)
+	if err != nil {
+		// schemes is a fixed expression, so failing to build the regex is a programming error
+		panic(err)
+	}
+
+	return rxStrict.ReplaceAllStringFunc(in, func(urlString string) string {
+		thisURL, err := url.Parse(urlString)
+		if err != nil {
+			return urlString // we can't parse it as a URL so don't replace it
+		}
+
+		shortString := thisURL.Hostname() + thisURL.Path
+
+		// keep the components in URL order: the query comes before the fragment,
+		// otherwise a link with both would be displayed as host/path#fragment?query
+		if thisURL.RawQuery != "" {
+			shortString += "?" + thisURL.RawQuery
+		}
+
+		if thisURL.Fragment != "" {
+			shortString += "#" + thisURL.Fragment
+		}
+
+		return fmt.Sprintf(`<a href="%s" rel="noopener">%s</a>`, urlString, shortString)
+	})
+}
diff --git a/internal/text/link_test.go b/internal/text/link_test.go
new file mode 100644
index 000000000..636f26f7f
--- /dev/null
+++ b/internal/text/link_test.go
@@ -0,0 +1,155 @@
+/*
+ GoToSocial
+ Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package text_test
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/suite"
+ "github.com/superseriousbusiness/gotosocial/internal/text"
+)
+
+// text1 contains several http/https links, a bare hostname without a scheme,
+// and two links glued together without any separator.
+const text1 = `
+This is a text with some links in it. Here's link number one: https://example.org/link/to/something#fragment
+
+Here's link number two: http://test.example.org?q=bahhhhhhhhhhhh
+
+https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it
+
+really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme
+
+https://example.orghttps://google.com <-- this shouldn't work either, but it does?! OK
+`
+
+// text2 contains the same link twice.
+const text2 = `
+this is one link: https://example.org
+
+this is the same link again: https://example.org
+
+these should be deduplicated
+`
+
+// text3 contains only a mailto link, which is not one of the accepted schemes.
+const text3 = `
+here's a mailto link: mailto:whatever@test.org
+`
+
+// text4 contains two different links where one is a prefix of the other.
+const text4 = `
+two similar links:
+
+https://example.org
+
+https://example.org/test
+`
+
+// text5 contains a link that is already wrapped in an HTML anchor.
+const text5 = `
+what happens when we already have a link within an href?
+
+<a href="https://example.org">https://example.org</a>
+`
+
+// TextTestSuite groups the tests for the link helpers of the text package (FindLinks and ReplaceLinks).
+type TextTestSuite struct {
+	suite.Suite
+}
+
+// TestParseURLsFromText1 checks that FindLinks returns the four links of text1 in order of appearance.
+func (suite *TextTestSuite) TestParseURLsFromText1() {
+	urls, err := text.FindLinks(text1)
+
+	assert.NoError(suite.T(), err)
+
+	// make sure all four links are there before indexing, so that a regression
+	// fails this assertion instead of panicking with an index out of range
+	if !assert.Len(suite.T(), urls, 4) {
+		return
+	}
+
+	assert.Equal(suite.T(), "https://example.org/link/to/something#fragment", urls[0].String())
+	assert.Equal(suite.T(), "http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String())
+	assert.Equal(suite.T(), "https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it", urls[2].String())
+	assert.Equal(suite.T(), "https://example.orghttps://google.com", urls[3].String())
+}
+
+// TestParseURLsFromText2 checks that a link occurring twice in the text is only returned once.
+func (suite *TextTestSuite) TestParseURLsFromText2() {
+	t := suite.T()
+
+	found, err := text.FindLinks(text2)
+	assert.NoError(t, err)
+
+	// both occurrences are the same link, so exactly one entry must remain
+	assert.Len(t, found, 1)
+}
+
+// TestParseURLsFromText3 checks that a mailto link is not picked up by FindLinks.
+func (suite *TextTestSuite) TestParseURLsFromText3() {
+	t := suite.T()
+
+	found, err := text.FindLinks(text3)
+	assert.NoError(t, err)
+
+	// nothing may be returned because `mailto:` isn't an accepted scheme
+	assert.Len(t, found, 0)
+}
+
+// TestReplaceLinksFromText1 checks that every link of text1 is turned into an anchor
+// while the bare hostname without a scheme is left alone.
+func (suite *TextTestSuite) TestReplaceLinksFromText1() {
+	expected := `
+This is a text with some links in it. Here's link number one: <a href="https://example.org/link/to/something#fragment" rel="noopener">example.org/link/to/something#fragment</a>
+
+Here's link number two: <a href="http://test.example.org?q=bahhhhhhhhhhhh" rel="noopener">test.example.org?q=bahhhhhhhhhhhh</a>
+
+<a href="https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it" rel="noopener">another.link.example.org/with/a/pretty/long/path/at/the/end/of/it</a>
+
+really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme
+
+<a href="https://example.orghttps://google.com" rel="noopener">example.orghttps//google.com</a> <-- this shouldn't work either, but it does?! OK
+`
+
+	actual := text.ReplaceLinks(text1)
+	assert.Equal(suite.T(), expected, actual)
+}
+
+// TestReplaceLinksFromText2 checks that both occurrences of a repeated link are replaced.
+func (suite *TextTestSuite) TestReplaceLinksFromText2() {
+	expected := `
+this is one link: <a href="https://example.org" rel="noopener">example.org</a>
+
+this is the same link again: <a href="https://example.org" rel="noopener">example.org</a>
+
+these should be deduplicated
+`
+
+	actual := text.ReplaceLinks(text2)
+	assert.Equal(suite.T(), expected, actual)
+}
+
+// TestReplaceLinksFromText3 checks that a mailto link is left untouched by ReplaceLinks.
+func (suite *TextTestSuite) TestReplaceLinksFromText3() {
+	// only https and http are accepted, so the mailto link must come back unchanged
+	expected := `
+here's a mailto link: mailto:whatever@test.org
+`
+
+	actual := text.ReplaceLinks(text3)
+	assert.Equal(suite.T(), expected, actual)
+}
+
+// TestReplaceLinksFromText4 checks that two links sharing a common prefix are each replaced in full.
+func (suite *TextTestSuite) TestReplaceLinksFromText4() {
+	expected := `
+two similar links:
+
+<a href="https://example.org" rel="noopener">example.org</a>
+
+<a href="https://example.org/test" rel="noopener">example.org/test</a>
+`
+
+	actual := text.ReplaceLinks(text4)
+	assert.Equal(suite.T(), expected, actual)
+}
+
+// TestReplaceLinksFromText5 pins down the known double-formatting of a link that is already inside an anchor.
+func (suite *TextTestSuite) TestReplaceLinksFromText5() {
+	// this output is knowingly broken: it is the reason why html should always be
+	// sanitized before being passed into the ReplaceLinks function
+	expected := `
+what happens when we already have a link within an href?
+
+<a href="<a href="https://example.org" rel="noopener">example.org</a>"><a href="https://example.org" rel="noopener">example.org</a></a>
+`
+
+	actual := text.ReplaceLinks(text5)
+	assert.Equal(suite.T(), expected, actual)
+}
+
+// TestTextTestSuite is the go test entry point that runs every test of TextTestSuite.
+func TestTextTestSuite(t *testing.T) {
+	textSuite := &TextTestSuite{}
+	suite.Run(t, textSuite)
+}
diff --git a/internal/text/plain.go b/internal/text/plain.go
index 24ef16f8e..4f6659484 100644
--- a/internal/text/plain.go
+++ b/internal/text/plain.go
@@ -28,6 +28,9 @@ import (
func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(plain)
+ // format links nicely
+ content = ReplaceLinks(content)
+
// format mentions nicely
for _, menchie := range mentions {
targetAccount := &gtsmodel.Account{}