diff options
author | 2023-11-21 15:13:30 +0100 | |
---|---|---|
committer | 2023-11-21 15:13:30 +0100 | |
commit | cfefbc08d822cd85787d95dc2ee253e3368826d8 (patch) | |
tree | af6d6257dddca1645ab5f8e34a1c79ac80d82e0e /internal/typeutils | |
parent | [docs] Annotate split-domain setup (#2372) (diff) | |
download | gotosocial-cfefbc08d822cd85787d95dc2ee253e3368826d8.tar.xz |
[feature] Federate status language in and out (#2366)
* [feature] Federate status language in + out
* go fmt
* tests, little fix
* improve comments
* unnest a bit
* avoid unnecessary nil check
* use more descriptive variable for contentMap
* prefer instance languages when selecting from contentMap
* update docs to reflect lang selection
* rename rdfLangString -> rdfLangs
* update comments to mention Pollable
* iter through slice instead of map
Diffstat (limited to 'internal/typeutils')
-rw-r--r-- | internal/typeutils/astointernal.go | 15 | ||||
-rw-r--r-- | internal/typeutils/astointernal_test.go | 9 | ||||
-rw-r--r-- | internal/typeutils/internaltoas.go | 10 | ||||
-rw-r--r-- | internal/typeutils/internaltoas_test.go | 44 | ||||
-rw-r--r-- | internal/typeutils/util.go | 101 | ||||
-rw-r--r-- | internal/typeutils/util_test.go | 114 | ||||
-rw-r--r-- | internal/typeutils/wrap_test.go | 3 |
7 files changed, 273 insertions, 23 deletions
diff --git a/internal/typeutils/astointernal.go b/internal/typeutils/astointernal.go index 707f51629..c7908ad24 100644 --- a/internal/typeutils/astointernal.go +++ b/internal/typeutils/astointernal.go @@ -244,9 +244,15 @@ func (c *Converter) ASStatusToStatus(ctx context.Context, statusable ap.Statusab } // status.Content + // status.Language // - // The (html-formatted) content of this status. - status.Content = ap.ExtractContent(statusable) + // Many implementations set both content + // and contentMap; we can use these to + // infer the language of the status. + status.Content, status.Language = ContentToContentLanguage( + ctx, + ap.ExtractContent(statusable), + ) // status.Attachments // @@ -396,9 +402,6 @@ func (c *Converter) ASStatusToStatus(ctx context.Context, statusable ap.Statusab return &s }() - // language - // TODO: we might be able to extract this from the contentMap field - // ActivityStreamsType status.ActivityStreamsType = statusable.GetTypeName() @@ -707,7 +710,7 @@ func (c *Converter) ASFlagToReport(ctx context.Context, flaggable ap.Flaggable) // For Mastodon, this will just be a string, or nothing. // In Misskey's case, it may also contain the URLs of // one or more reported statuses, so extract these too. - content := ap.ExtractContent(flaggable) + content := ap.ExtractContent(flaggable).Content statusURIs := []*url.URL{} inlineURLs := misskeyReportInlineURLs(content) statusURIs = append(statusURIs, inlineURLs...) diff --git a/internal/typeutils/astointernal_test.go b/internal/typeutils/astointernal_test.go index 10ea422fa..851d57efc 100644 --- a/internal/typeutils/astointernal_test.go +++ b/internal/typeutils/astointernal_test.go @@ -45,6 +45,10 @@ func (suite *ASToInternalTestSuite) jsonToType(in string) vocab.Type { suite.FailNow(err.Error()) } + if statusable, ok := t.(ap.Statusable); ok { + ap.NormalizeIncomingContent(statusable, m) + } + return t } @@ -103,7 +107,8 @@ func (suite *ASToInternalTestSuite) TestParsePublicStatus() { suite.NoError(err) suite.Equal("reading: Punishment and Reward in the Corporate University", status.ContentWarning) - suite.Equal(`<p>> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content) + suite.Equal(`<p>> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content) + suite.Equal("en", status.Language) } func (suite *ASToInternalTestSuite) TestParsePublicStatusNoURL() { @@ -117,7 +122,7 @@ func (suite *ASToInternalTestSuite) TestParsePublicStatusNoURL() { suite.NoError(err) suite.Equal("reading: Punishment and Reward in the Corporate University", status.ContentWarning) - suite.Equal(`<p>> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content) + suite.Equal(`<p>> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content) // on statuses with no URL in them (like ones we get from pleroma sometimes) we should use the AP URI of the status as URL suite.Equal("http://fossbros-anonymous.io/users/foss_satan/statuses/108138763199405167", status.URL) diff --git a/internal/typeutils/internaltoas.go b/internal/typeutils/internaltoas.go index 16467be40..ff502296b 100644 --- a/internal/typeutils/internaltoas.go +++ b/internal/typeutils/internaltoas.go @@ -607,9 +607,17 @@ func (c *Converter) StatusToAS(ctx context.Context, s *gtsmodel.Status) (ap.Stat // conversation // TODO - // content -- the actual post itself + // content -- the actual post + // itself, plus the language contentProp := streams.NewActivityStreamsContentProperty() contentProp.AppendXMLSchemaString(s.Content) + + if s.Language != "" { + contentProp.AppendRDFLangString(map[string]string{ + s.Language: s.Content, + }) + } + status.SetActivityStreamsContent(contentProp) // attachments diff --git a/internal/typeutils/internaltoas_test.go b/internal/typeutils/internaltoas_test.go index 01dde66fb..878040dcc 100644 --- a/internal/typeutils/internaltoas_test.go +++ b/internal/typeutils/internaltoas_test.go @@ -340,6 +340,9 @@ func (suite *InternalToASTestSuite) TestStatusToAS() { "attributedTo": "http://localhost:8080/users/the_mighty_zork", "cc": "http://localhost:8080/users/the_mighty_zork/followers", "content": "hello everyone!", + "contentMap": { + "en": "hello everyone!" + }, "id": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY", "published": "2021-10-20T12:40:37+02:00", "replies": { @@ -379,16 +382,21 @@ func (suite *InternalToASTestSuite) TestStatusWithTagsToASWithIDs() { // http://joinmastodon.org/ns, https://www.w3.org/ns/activitystreams -- // will appear, so trim them out of the string for consistency trimmed := strings.SplitAfter(string(bytes), `"attachment":`)[1] - suite.Equal(` { - "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", - "mediaType": "image/jpeg", - "name": "Black and white image of some 50's style text saying: Welcome On Board", - "type": "Document", - "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" - }, + suite.Equal(` [ + { + "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", + "mediaType": "image/jpeg", + "name": "Black and white image of some 50's style text saying: Welcome On Board", + "type": "Document", + "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" + } + ], "attributedTo": "http://localhost:8080/users/admin", "cc": "http://localhost:8080/users/admin/followers", "content": "hello world! #welcome ! first post on the instance :rainbow: !", + "contentMap": { + "en": "hello world! #welcome ! first post on the instance :rainbow: !" + }, "id": "http://localhost:8080/users/admin/statuses/01F8MH75CBF9JFX4ZAD54N0W0R", "published": "2021-10-20T11:36:45Z", "replies": { @@ -446,16 +454,21 @@ func (suite *InternalToASTestSuite) TestStatusWithTagsToASFromDB() { // http://joinmastodon.org/ns, https://www.w3.org/ns/activitystreams -- // will appear, so trim them out of the string for consistency trimmed := strings.SplitAfter(string(bytes), `"attachment":`)[1] - suite.Equal(` { - "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", - "mediaType": "image/jpeg", - "name": "Black and white image of some 50's style text saying: Welcome On Board", - "type": "Document", - "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" - }, + suite.Equal(` [ + { + "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", + "mediaType": "image/jpeg", + "name": "Black and white image of some 50's style text saying: Welcome On Board", + "type": "Document", + "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" + } + ], "attributedTo": "http://localhost:8080/users/admin", "cc": "http://localhost:8080/users/admin/followers", "content": "hello world! #welcome ! first post on the instance :rainbow: !", + "contentMap": { + "en": "hello world! #welcome ! first post on the instance :rainbow: !" + }, "id": "http://localhost:8080/users/admin/statuses/01F8MH75CBF9JFX4ZAD54N0W0R", "published": "2021-10-20T11:36:45Z", "replies": { @@ -519,6 +532,9 @@ func (suite *InternalToASTestSuite) TestStatusToASWithMentions() { "http://localhost:8080/users/the_mighty_zork" ], "content": "hi @the_mighty_zork welcome to the instance!", + "contentMap": { + "en": "hi @the_mighty_zork welcome to the instance!" + }, "id": "http://localhost:8080/users/admin/statuses/01FF25D5Q0DH7CHD57CTRS6WK0", "inReplyTo": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY", "published": "2021-11-20T13:32:16Z", diff --git a/internal/typeutils/util.go b/internal/typeutils/util.go index a19588221..8a8d4123b 100644 --- a/internal/typeutils/util.go +++ b/internal/typeutils/util.go @@ -31,6 +31,8 @@ import ( apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model" "github.com/superseriousbusiness/gotosocial/internal/config" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/language" + "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" "github.com/superseriousbusiness/gotosocial/internal/text" ) @@ -184,3 +186,102 @@ func placeholdUnknownAttachments(arr []apimodel.Attachment) (string, []apimodel. return text.SanitizeToHTML(aside.String()), arr } + +// ContentToContentLanguage tries to +// extract a content string and language +// tag string from the given intermediary +// content. +// +// Either/both of the returned strings may +// be empty, depending on how things go. +func ContentToContentLanguage( + ctx context.Context, + content gtsmodel.Content, +) ( + string, // content + string, // language +) { + var ( + contentStr string + langTagStr string + ) + + switch contentMap := content.ContentMap; { + // Simplest case: no `contentMap`. + // Return `content`, even if empty. + case contentMap == nil: + return content.Content, "" + + // `content` and `contentMap` set. + // Try to infer "primary" language. + case content.Content != "": + // Assume `content` is intended + // primary content, and look for + // corresponding language tag. + contentStr = content.Content + + for t, c := range contentMap { + if contentStr == c { + langTagStr = t + break + } + } + + // `content` not set; `contentMap` + // is set with only one value. + // This must be the "primary" lang. + case len(contentMap) == 1: + // Use an empty loop to + // get the values we want. + // nolint:revive + for langTagStr, contentStr = range contentMap { + } + + // Only `contentMap` is set, with more + // than one value. Map order is not + // guaranteed so we can't know the + // "primary" language. + // + // Try to select content using our + // instance's configured languages. + // + // In case of no hits, just take the + // first tag and content in the map. + default: + instanceLangs := config.GetInstanceLanguages() + for _, langTagStr = range instanceLangs.TagStrs() { + if contentStr = contentMap[langTagStr]; contentStr != "" { + // Hit! + break + } + } + + // If nothing found, just take + // the first entry we can get by + // breaking after the first iter. + if contentStr == "" { + for langTagStr, contentStr = range contentMap { + break + } + } + } + + if langTagStr != "" { + // Found a lang tag for this content, + // make sure it's valid / parseable. + lang, err := language.Parse(langTagStr) + if err != nil { + log.Warnf( + ctx, + "could not parse %s as BCP47 language tag in status contentMap: %v", + langTagStr, err, + ) + } else { + // Inferred the language! + // Use normalized version. + langTagStr = lang.TagStr + } + } + + return contentStr, langTagStr +} diff --git a/internal/typeutils/util_test.go b/internal/typeutils/util_test.go index e6610574b..0f852d399 100644 --- a/internal/typeutils/util_test.go +++ b/internal/typeutils/util_test.go @@ -18,7 +18,12 @@ package typeutils import ( + "context" "testing" + + "github.com/superseriousbusiness/gotosocial/internal/config" + "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/language" ) func TestMisskeyReportContentURLs1(t *testing.T) { @@ -44,3 +49,112 @@ misskey-formatted` t.Fatalf("wanted 0 urls, got %d", l) } } + +func TestContentToContentLanguage(t *testing.T) { + type testcase struct { + content gtsmodel.Content + instanceLanguages language.Languages + expectedContent string + expectedLang string + } + + ctx, cncl := context.WithCancel(context.Background()) + defer cncl() + + for i, testcase := range []testcase{ + { + content: gtsmodel.Content{ + Content: "hello world", + ContentMap: nil, + }, + expectedContent: "hello world", + expectedLang: "", + }, + { + content: gtsmodel.Content{ + Content: "", + ContentMap: map[string]string{ + "en": "hello world", + }, + }, + expectedContent: "hello world", + expectedLang: "en", + }, + { + content: gtsmodel.Content{ + Content: "bonjour le monde", + ContentMap: map[string]string{ + "en": "hello world", + "fr": "bonjour le monde", + }, + }, + expectedContent: "bonjour le monde", + expectedLang: "fr", + }, + { + content: gtsmodel.Content{ + Content: "bonjour le monde", + ContentMap: map[string]string{ + "en": "hello world", + }, + }, + expectedContent: "bonjour le monde", + expectedLang: "", + }, + { + content: gtsmodel.Content{ + Content: "", + ContentMap: map[string]string{ + "en": "hello world", + "ru": "Привет, мир!", + "nl": "hallo wereld!", + "ca": "Hola món!", + }, + }, + instanceLanguages: language.Languages{ + {TagStr: "en"}, + {TagStr: "ca"}, + }, + expectedContent: "hello world", + expectedLang: "en", + }, + { + content: gtsmodel.Content{ + Content: "", + ContentMap: map[string]string{ + "en": "hello world", + "ru": "Привет, мир!", + "nl": "hallo wereld!", + "ca": "Hola món!", + }, + }, + instanceLanguages: language.Languages{ + {TagStr: "ca"}, + {TagStr: "en"}, + }, + expectedContent: "Hola món!", + expectedLang: "ca", + }, + } { + langs, err := language.InitLangs(testcase.instanceLanguages.TagStrs()) + if err != nil { + t.Fatal(err) + } + config.SetInstanceLanguages(langs) + + content, language := ContentToContentLanguage(ctx, testcase.content) + if content != testcase.expectedContent { + t.Errorf( + "test %d expected content '%s' got '%s'", + i, testcase.expectedContent, content, + ) + } + + if language != testcase.expectedLang { + t.Errorf( + "test %d expected language '%s' got '%s'", + i, testcase.expectedLang, language, + ) + } + } +} diff --git a/internal/typeutils/wrap_test.go b/internal/typeutils/wrap_test.go index 9d6d95983..453073ed6 100644 --- a/internal/typeutils/wrap_test.go +++ b/internal/typeutils/wrap_test.go @@ -85,6 +85,9 @@ func (suite *WrapTestSuite) TestWrapNoteInCreate() { "attributedTo": "http://localhost:8080/users/the_mighty_zork", "cc": "http://localhost:8080/users/the_mighty_zork/followers", "content": "hello everyone!", + "contentMap": { + "en": "hello everyone!" + }, "id": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY", "published": "2021-10-20T12:40:37+02:00", "replies": { |