summary refs log tree commit diff
path: root/internal/text/normalize.go
diff options
context:
space:
mode:
author: tobi <31960611+tsmethurst@users.noreply.github.com> 2023-07-31 15:47:35 +0200
committer: GitHub <noreply@github.com> 2023-07-31 15:47:35 +0200
commit: 2796a2e82f16ade9872008878cf88299bd66b4e7 (patch)
tree: 76f7b69cc1da57ca10b71c57abf1892575bea100 /internal/text/normalize.go
parent: [performance] cache follow, follow request and block ID lists (#2027) (diff)
download: gotosocial-2796a2e82f16ade9872008878cf88299bd66b4e7.tar.xz
[feature] Hashtag federation (in/out), hashtag client API endpoints (#2032)
* update go-fed
* do the things
* remove unused columns from tags
* update to latest lingo from main
* further tag shenanigans
* serve stub page at tag endpoint
* we did it lads
* tests, oh tests, ohhh tests, oh tests (doo doo doo doo)
* swagger docs
* document hashtag usage + federation
* instanceGet
* don't bother parsing tag href
* rename whereStartsWith -> whereStartsLike
* remove GetOrCreateTag
* don't cache status tag timelineability
Diffstat (limited to 'internal/text/normalize.go')
-rw-r--r-- internal/text/normalize.go 60
1 files changed, 60 insertions, 0 deletions
diff --git a/internal/text/normalize.go b/internal/text/normalize.go
new file mode 100644
index 000000000..14caf6311
--- /dev/null
+++ b/internal/text/normalize.go
@@ -0,0 +1,60 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package text
+
+import (
+ "strings"
+
+ "github.com/superseriousbusiness/gotosocial/internal/util"
+ "golang.org/x/text/unicode/norm"
+)
+
+const (
+ maximumHashtagLength = 100 // length cap enforced by NormalizeHashtag
+)
+
+// NormalizeHashtag normalizes the given hashtag text by
+// removing the initial '#' symbol, and then decomposing
+// and canonically recomposing chars + combining diacritics
+// in the text to single unicode characters, following
+// Normalization Form C (https://unicode.org/reports/tr15/).
+//
+// The normalized string is then validated: it must be no
+// longer than maximumHashtagLength characters (counted as
+// runes, not bytes), and every rune must be accepted by
+// util.IsPermittedInHashtag. The returned bool reports
+// whether validation passed; the normalized string is
+// returned either way.
+func NormalizeHashtag(text string) (string, bool) {
+	// Normalizing avoids visually-identical hashtags being
+	// stored with different unicode representations (e.g.
+	// precomposed letters vs. combining diacritics).
+	normalized := norm.NFC.String(strings.TrimPrefix(text, "#"))
+
+	// Count runes explicitly: the index yielded by ranging
+	// over a string is a byte offset, which would shrink the
+	// limit for any text containing multi-byte characters.
+	count := 0
+	for _, r := range normalized {
+		count++
+		if count > maximumHashtagLength || !util.IsPermittedInHashtag(r) {
+			return normalized, false
+		}
+	}
+	return normalized, true
+}