diff options
Diffstat (limited to 'vendor/mvdan.cc/xurls')
-rw-r--r-- | vendor/mvdan.cc/xurls/v2/.gitattributes | 2 | ||||
-rw-r--r-- | vendor/mvdan.cc/xurls/v2/README.md | 8 | ||||
-rw-r--r-- | vendor/mvdan.cc/xurls/v2/schemes.go | 15 | ||||
-rw-r--r-- | vendor/mvdan.cc/xurls/v2/tlds.go | 14 | ||||
-rw-r--r-- | vendor/mvdan.cc/xurls/v2/unicode.go | 4 | ||||
-rw-r--r-- | vendor/mvdan.cc/xurls/v2/xurls.go | 140 |
6 files changed, 121 insertions, 62 deletions
diff --git a/vendor/mvdan.cc/xurls/v2/.gitattributes b/vendor/mvdan.cc/xurls/v2/.gitattributes new file mode 100644 index 000000000..6f9522992 --- /dev/null +++ b/vendor/mvdan.cc/xurls/v2/.gitattributes @@ -0,0 +1,2 @@ +# To prevent CRLF breakages on Windows for fragile files, like testdata. +* -text diff --git a/vendor/mvdan.cc/xurls/v2/README.md b/vendor/mvdan.cc/xurls/v2/README.md index 67f721ede..40b0cdc98 100644 --- a/vendor/mvdan.cc/xurls/v2/README.md +++ b/vendor/mvdan.cc/xurls/v2/README.md @@ -2,7 +2,7 @@ [](https://pkg.go.dev/mvdan.cc/xurls/v2) -Extract urls from text using regular expressions. Requires Go 1.15 or later. +Extract urls from text using regular expressions. Requires Go 1.16 or later. ```go import "mvdan.cc/xurls/v2" @@ -22,14 +22,14 @@ Since API is centered around [regexp.Regexp](https://golang.org/pkg/regexp/#Rege many other methods are available, such as finding the [byte indexes](https://golang.org/pkg/regexp/#Regexp.FindAllIndex) for all matches. -Note that calling the exposed functions means compiling a regular expression, so -repeated calls should be avoided. +The regular expressions are compiled when the API is first called. +Any subsequent calls will use the same regular expression pointers. #### cmd/xurls To install the tool globally: - cd $(mktemp -d); go mod init tmp; GO111MODULE=on go get mvdan.cc/xurls/v2/cmd/xurls + go install mvdan.cc/xurls/v2/cmd/xurls@latest ```shell $ echo "Do gophers live in http://golang.org?" | xurls diff --git a/vendor/mvdan.cc/xurls/v2/schemes.go b/vendor/mvdan.cc/xurls/v2/schemes.go index 5f6a80ae4..bff9862d9 100644 --- a/vendor/mvdan.cc/xurls/v2/schemes.go +++ b/vendor/mvdan.cc/xurls/v2/schemes.go @@ -23,6 +23,7 @@ var Schemes = []string{ `android`, `appdata`, `apt`, + `ar`, `ark`, `attachment`, `aw`, @@ -51,7 +52,6 @@ var Schemes = []string{ `com-eventbrite-attendee`, `content`, `content-type`, - `conti`, `crid`, `cvs`, `dab`, @@ -85,6 +85,7 @@ var Schemes = []string{ `fax`, `feed`, `feedready`, + `fido`, `file`, `filesystem`, `finger`, @@ -174,6 +175,7 @@ var Schemes = []string{ `ms-inputapp`, `ms-lockscreencomponent-config`, `ms-media-stream-id`, + `ms-meetnow`, `ms-mixedrealitycapture`, `ms-mobileplans`, `ms-officeapp`, @@ -209,6 +211,7 @@ var Schemes = []string{ `ms-settings-wifi`, `ms-settings-workplace`, `ms-spd`, + `ms-stickers`, `ms-sttoverlay`, `ms-transit-to`, `ms-useractivityset`, @@ -278,13 +281,14 @@ var Schemes = []string{ `sftp`, `sgn`, `shc`, - `shttp`, `sieve`, `simpleledger`, + `simplex`, `sip`, `sips`, `skype`, `smb`, + `smp`, `sms`, `smtp`, `snews`, @@ -300,8 +304,10 @@ var Schemes = []string{ `stun`, `stuns`, `submit`, - `swh`, `svn`, + `swh`, + `swid`, + `swidpath`, `tag`, `teamspeak`, `tel`, @@ -318,12 +324,13 @@ var Schemes = []string{ `tv`, `udp`, `unreal`, - `upt`, `urn`, `ut2004`, + `uuid-in-package`, `v-event`, `vemmi`, `ventrilo`, + `ves`, `videotex`, `vnc`, `view-source`, diff --git a/vendor/mvdan.cc/xurls/v2/tlds.go b/vendor/mvdan.cc/xurls/v2/tlds.go index 5b1ca5fd0..eb87f2789 100644 --- a/vendor/mvdan.cc/xurls/v2/tlds.go +++ b/vendor/mvdan.cc/xurls/v2/tlds.go @@ -34,7 +34,6 @@ var TLDs = []string{ `aero`, `aetna`, `af`, - `afamilycompany`, `afl`, `africa`, `ag`, @@ -186,7 +185,6 @@ var TLDs = []string{ `brussels`, `bs`, `bt`, - `budapest`, `bugatti`, `build`, `builders`, @@ -310,7 +308,6 @@ var TLDs = []string{ `crs`, `cruise`, `cruises`, - `csc`, `cu`, `cuisinella`, `cv`, @@ -369,7 +366,6 @@ var TLDs = []string{ `drive`, `dtv`, `dubai`, - `duck`, `dunlop`, `dupont`, `durban`, @@ -503,7 +499,6 @@ var TLDs = []string{ `gives`, `giving`, `gl`, - `glade`, `glass`, `gle`, `global`, @@ -667,6 +662,7 @@ var TLDs = []string{ `kh`, `ki`, `kia`, + `kids`, `kim`, `kinder`, `kindle`, @@ -729,7 +725,6 @@ var TLDs = []string{ `lipsy`, `live`, `living`, - `lixil`, `lk`, `llc`, `llp`, @@ -887,7 +882,6 @@ var TLDs = []string{ `nz`, `obi`, `observer`, - `off`, `office`, `okinawa`, `olayan`, @@ -987,10 +981,8 @@ var TLDs = []string{ `qpon`, `quebec`, `quest`, - `qvc`, `racing`, `radio`, - `raid`, `re`, `read`, `realestate`, @@ -1022,7 +1014,6 @@ var TLDs = []string{ `ril`, `rio`, `rip`, - `rmit`, `ro`, `rocher`, `rocks`, @@ -1068,7 +1059,6 @@ var TLDs = []string{ `schule`, `schwarz`, `science`, - `scjohnson`, `scot`, `sd`, `se`, @@ -1161,7 +1151,6 @@ var TLDs = []string{ `suzuki`, `sv`, `swatch`, - `swiftcover`, `swiss`, `sx`, `sy`, @@ -1470,7 +1459,6 @@ var TLDs = []string{ `嘉里`, `嘉里大酒店`, `在线`, - `大众汽车`, `大拿`, `天主教`, `娱乐`, diff --git a/vendor/mvdan.cc/xurls/v2/unicode.go b/vendor/mvdan.cc/xurls/v2/unicode.go index 40995e469..90dedba30 100644 --- a/vendor/mvdan.cc/xurls/v2/unicode.go +++ b/vendor/mvdan.cc/xurls/v2/unicode.go @@ -2,4 +2,6 @@ package xurls -const otherPuncMinusDoubleQuote = "!#%&'\\*,\\./:;\\?@\\\\¡§¶·¿;·՚՛՜՝՞՟։׀׃׆׳״؉؊،؍؛؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾࡞।॥॰৽੶૰౷಄෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒༔྅࿐࿑࿒࿓࿔࿙࿚၊။၌၍၎၏჻፠፡።፣፤፥፦፧፨᙮᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠇᠈᠉᠊᥄᥅᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᯼᯽᯾᯿᰻᰼᰽᰾᰿᱾᱿᳀᳁᳂᳃᳄᳅᳆᳇᳓‖‗†‡•‣․‥…‧‰‱′″‴‵‶‷‸※‼‽‾⁁⁂⁃⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⳹⳺⳻⳼⳾⳿⵰⸀⸁⸆⸇⸈⸋⸎⸏⸐⸑⸒⸓⸔⸕⸖⸘⸙⸛⸞⸟⸪⸫⸬⸭⸮⸰⸱⸲⸳⸴⸵⸶⸷⸸⸹⸼⸽⸾⸿⹁⹃⹄⹅⹆⹇⹈⹉⹊⹋⹌⹍⹎⹏⹒、。〃〽・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꣼꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꫰꫱꯫︐︑︒︓︔︕︖︙︰﹅﹆﹉﹊﹋﹌﹐﹑﹒﹔﹕﹖﹗﹟﹠﹡﹨﹪﹫!"#%&'*,./:;?@\。、・𐄀𐄁𐄂𐎟𐏐𐕯𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐫰𐫱𐫲𐫳𐫴𐫵𐫶𐬹𐬺𐬻𐬼𐬽𐬾𐬿𐮙𐮚𐮛𐮜𐽕𐽖𐽗𐽘𐽙𑁇𑁈𑁉𑁊𑁋𑁌𑁍𑂻𑂼𑂾𑂿𑃀𑃁𑅀𑅁𑅂𑅃𑅴𑅵𑇅𑇆𑇇𑇈𑇍𑇛𑇝𑇞𑇟𑈸𑈹𑈺𑈻𑈼𑈽𑊩𑑋𑑌𑑍𑑎𑑏𑑚𑑛𑑝𑓆𑗁𑗂𑗃𑗄𑗅𑗆𑗇𑗈𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑙃𑙠𑙡𑙢𑙣𑙤𑙥𑙦𑙧𑙨𑙩𑙪𑙫𑙬𑜼𑜽𑜾𑠻𑥄𑥅𑥆𑧢𑨿𑩀𑩁𑩂𑩃𑩄𑩅𑩆𑪚𑪛𑪜𑪞𑪟𑪠𑪡𑪢𑱁𑱂𑱃𑱄𑱅𑱰𑱱𑻷𑻸𑿿𒑰𒑱𒑲𒑳𒑴𖩮𖩯𖫵𖬷𖬸𖬹𖬺𖬻𖭄𖺗𖺘𖺙𖺚𖿢𛲟𝪇𝪈𝪉𝪊𝪋𞥞𞥟" +const allowedUcsChar = "¡-ᙿᚁ-\u1fff\u200b-‧\u202a-\u202e‰-⁞\u2060-\u2fff、-\ud7ff豈-\ufdcfﷰ-\uffef𐀀-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd" + +const allowedUcsCharMinusPunc = "¢-¦¨-µ¸-¾À-ͽͿ-ΆΈ-ՙՠ-ֈ֊-ֿׁ-ׂׄ-ׇׅ-ײ\u05f5-؈؋؎-ؚ\u061c-\u061dؠ-٩ٮ-ۓە-ۿ\u070e-߶ߺ-\u082f\u083f-\u085d\u085f-ॣ०-९ॱ-ৼ৾-ੵ\u0a77-૯૱-\u0c76౸-ಃಅ-ෳ\u0df5-๎๐-๙\u0e5c-༃༓༕-྄྆-࿏࿕-࿘\u0fdb-၉ၐ-ჺჼ-፟፩-᙭ᙯ-ᙿᚁ-ᛪᛮ-᜴\u1737-៓ៗ៛-\u17ff᠆᠋-\u1943᥆-\u1a1dᨠ-\u1a9fᪧ\u1aae-᭙᭡-\u1bfbᰀ-\u1c3a᱀-ᱽᲀ-Ჿ\u1cc8-᳔᳒-\u1fff\u200b-―‘-‟\u202a-\u202e‹-›‿-⁀⁄-⁆⁒⁔\u2060-\u2cf8⳽ⴀ-ⵯ\u2d71-ⷿ⸂-⸅⸉-⸊⸌-⸍⸗⸚⸜-⸝⸠-⸩ⸯ⸺-⸻⹀⹂⹐-⹑\u2e53-\u2fff〄-〼〾-ヺー-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱\ua6f8-ꡳ\ua878-\ua8cd꣐-ꣷꣻꣽ-꤭ꤰ-\ua95eꥠ-꧀\ua9ce-\ua9ddꧠ-\uaa5bꩠ-ꫝꫠ-ꫯꫲ-ꯪ꯬-\ud7ff豈-\ufdcfﷰ-️︗-︘\ufe1a-︯︱-﹄﹇-﹈﹍-﹏\ufe53﹘-﹞﹢-\ufe67﹩\ufe6c-\uff00$(-)+-0-9<->A-[]-⦆「-」ヲ-\uffef𐀀-\U000100ff\U00010103-\U0001039e𐎠-𐏏𐏑-\U0001056e\U00010570-\U00010856𐡘-\U0001091e𐤠-\U0001093e\U00010940-\U00010a4f\U00010a59-𐩾𐪀-𐫯\U00010af7-\U00010b38𐭀-\U00010b98\U00010b9d-𐽔\U00010f5a-𑁆\U0001104e-𑂺\U000110bd\U000110c2-𑄿𑅄-𑅳𑅶-𑇄𑇉-𑇌𑇎-𑇚𑇜\U000111e0-𑈷𑈾-𑊨\U000112aa-𑑊𑑐-𑑙\U0001145c𑑞-𑓅𑓇-𑗀𑗘-𑙀𑙄-\U0001165f\U0001166d-𑜻𑜿-𑠺\U0001183c-𑥃\U00011947-𑧡𑧣-𑨾𑩇-𑪙𑪝\U00011aa3-𑱀\U00011c46-\U00011c6f𑱲-𑻶\U00011ef9-\U00011ffe𒀀-\U0001246f\U00012475-\U00016a6d\U00016a70-𖫴\U00016af6-𖬶𖬼-𖭃𖭅-𖺖\U00016e9b-𖿡𖿣-𛲞\U0001bca0-𝪆\U0001da8c-\U0001e95d\U0001e960-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd" diff --git a/vendor/mvdan.cc/xurls/v2/xurls.go b/vendor/mvdan.cc/xurls/v2/xurls.go index 053e6436f..4113b07ba 100644 --- a/vendor/mvdan.cc/xurls/v2/xurls.go +++ b/vendor/mvdan.cc/xurls/v2/xurls.go @@ -7,6 +7,7 @@ package xurls import ( "regexp" "strings" + "sync" "unicode/utf8" ) @@ -15,32 +16,71 @@ import ( //go:generate go run ./generate/unicodegen const ( + // pathCont is based on https://www.rfc-editor.org/rfc/rfc3987#section-2.2 + // but does not match separators anywhere or most puncutation in final position, + // to avoid creating asymmetries like + // `Did you know that **<a href="...">https://example.com/**</a> is reserved for documentation?` + // from `Did you know that **https://example.com/** is reserved for documentation?`. + unreservedChar = `a-zA-Z0-9\-._~` + endUnreservedChar = `a-zA-Z0-9\-_~` + midSubDelimChar = `!$&'*+,;=` + endSubDelimChar = `$&+=` + midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar + endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunc + iPrivateChar = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}` + midIChar = `/?#\\` + midIPathSegmentChar + iPrivateChar + endIChar = `/#` + endIPathSegmentChar + iPrivateChar + wellParen = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)` + wellBrack = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]` + wellBrace = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}` + wellAll = wellParen + `|` + wellBrack + `|` + wellBrace + pathCont = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+` + letter = `\p{L}` mark = `\p{M}` number = `\p{N}` iriChar = letter + mark + number - currency = `\p{Sc}` - otherSymb = `\p{So}` - endChar = iriChar + `/\-_+&~%=#` + currency + otherSymb - midChar = endChar + "_*" + otherPuncMinusDoubleQuote - wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)` - wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]` - wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}` - wellAll = wellParen + `|` + wellBrack + `|` + wellBrace - pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+` - - iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?` - domain = `(` + iri + `\.)+` - octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])` - ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b` - ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:` - ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)` - port = `(:[0-9]*)?` + iri = `[` + iriChar + `](?:[` + iriChar + `\-]*[` + iriChar + `])?` + subdomain = `(?:` + iri + `\.)+` + octet = `(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])` + ipv4Addr = octet + `\.` + octet + `\.` + octet + `\.` + octet + + // ipv6Addr is based on https://datatracker.ietf.org/doc/html/rfc4291#section-2.2 + // with a specific alternative for each valid count of leading 16-bit hexadecimal "chomps" + // that have not been replaced with a `::` elision. + h4 = `[0-9a-fA-F]{1,4}` + ipv6AddrMinusEmpty = `(?:` + + // 7 colon-terminated chomps, followed by a final chomp or the rest of an elision. + `(?:` + h4 + `:){7}(?:` + h4 + `|:)|` + + // 6 chomps, followed by an IPv4 address or elision with final chomp or final elision. + `(?:` + h4 + `:){6}(?:` + ipv4Addr + `|:` + h4 + `|:)|` + + // 5 chomps, followed by an elision with optional IPv4 or up to 2 final chomps. + `(?:` + h4 + `:){5}(?::` + ipv4Addr + `|(?::` + h4 + `){1,2}|:)|` + + // 4 chomps, followed by an elision with optional IPv4 (optionally preceded by a chomp) or + // up to 3 final chomps. + `(?:` + h4 + `:){4}(?:(?::` + h4 + `){0,1}:` + ipv4Addr + `|(?::` + h4 + `){1,3}|:)|` + + // 3 chomps, followed by an elision with optional IPv4 (preceded by up to 2 chomps) or + // up to 4 final chomps. + `(?:` + h4 + `:){3}(?:(?::` + h4 + `){0,2}:` + ipv4Addr + `|(?::` + h4 + `){1,4}|:)|` + + // 2 chomps, followed by an elision with optional IPv4 (preceded by up to 3 chomps) or + // up to 5 final chomps. + `(?:` + h4 + `:){2}(?:(?::` + h4 + `){0,3}:` + ipv4Addr + `|(?::` + h4 + `){1,5}|:)|` + + // 1 chomp, followed by an elision with optional IPv4 (preceded by up to 4 chomps) or + // up to 6 final chomps. + `(?:` + h4 + `:){1}(?:(?::` + h4 + `){0,4}:` + ipv4Addr + `|(?::` + h4 + `){1,6}|:)|` + + // elision, followed by optional IPv4 (preceded by up to 5 chomps) or + // up to 7 final chomps. + // `:` is an intentionally omitted alternative, to avoid matching `::`. + `:(?:(?::` + h4 + `){0,5}:` + ipv4Addr + `|(?::` + h4 + `){1,7})` + + `)` + ipv6Addr = `(?:` + ipv6AddrMinusEmpty + `|::)` + ipAddrMinusEmpty = `(?:` + ipv6AddrMinusEmpty + `|\b` + ipv4Addr + `\b)` + port = `(?::[0-9]*)?` ) // AnyScheme can be passed to StrictMatchingScheme to match any possibly valid // scheme, and not just the known ones. -var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)` +var AnyScheme = `(?:[a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)` // SchemesNoAuthority is a sorted list of some well-known url schemes that are // followed by ":" instead of "://". The list includes both officially @@ -62,17 +102,33 @@ var SchemesNoAuthority = []string{ // // Mostly collected from https://en.wikipedia.org/wiki/List_of_URI_schemes#Unofficial_but_common_URI_schemes. var SchemesUnofficial = []string{ - `jdbc`, // Java database Connectivity - `postgres`, // PostgreSQL (short form) - `postgresql`, // PostgreSQL - `slack`, // Slack - `zoommtg`, // Zoom (desktop) - `zoomus`, // Zoom (mobile) + `gemini`, // gemini + `jdbc`, // Java database Connectivity + `moz-extension`, // Firefox extension + `postgres`, // PostgreSQL (short form) + `postgresql`, // PostgreSQL + `slack`, // Slack + `zoommtg`, // Zoom (desktop) + `zoomus`, // Zoom (mobile) } +// The regular expressions are compiled when the API is first called. +// Any subsequent calls will use the same regular expression pointers. +// +// We do not need to make a copy of them for each API call, +// as Copy is now only useful if one copy calls Longest but not another, +// and we always call Longest after compiling the regular expression. +var ( + strictRe *regexp.Regexp + strictInit sync.Once + + relaxedRe *regexp.Regexp + relaxedInit sync.Once +) + func anyOf(strs ...string) string { var b strings.Builder - b.WriteByte('(') + b.WriteString("(?:") for i, s := range strs { if i != 0 { b.WriteByte('|') @@ -84,8 +140,8 @@ func anyOf(strs ...string) string { } func strictExp() string { - schemes := `((` + anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...) + `)://|` + anyOf(SchemesNoAuthority...) + `:)` - return `(?i)` + schemes + `(?-i)` + pathCont + schemes := `(?:(?i)(?:` + anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...) + `)://|` + anyOf(SchemesNoAuthority...) + `:)` + return schemes + pathCont } func relaxedExp() string { @@ -102,35 +158,39 @@ func relaxedExp() string { // Use \b to make sure ASCII TLDs are immediately followed by a word break. // We can't do that with unicode TLDs, as they don't see following // whitespace as a word break. - tlds := `(?i)(` + punycode + `|` + anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b|` + anyOf(unicodeTLDs...) + `)(?-i)` - site := domain + tlds + tlds := `(?:(?i)` + punycode + `|` + anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b|` + anyOf(unicodeTLDs...) + `)` + domain := subdomain + tlds - hostName := `(` + site + `|` + ipAddr + `)` - webURL := hostName + port + `(/|/` + pathCont + `)?` - email := `[a-zA-Z0-9._%\-+]+@` + site - return strictExp() + `|` + webURL + `|` + email + hostName := `(?:` + domain + `|\[` + ipv6Addr + `\]|\b` + ipv4Addr + `\b)` + webURL := hostName + port + `(?:/` + pathCont + `|/)?` + email := `[a-zA-Z0-9._%\-+]+@` + domain + return strictExp() + `|` + webURL + `|` + email + `|` + ipv6AddrMinusEmpty } // Strict produces a regexp that matches any URL with a scheme in either the // Schemes or SchemesNoAuthority lists. func Strict() *regexp.Regexp { - re := regexp.MustCompile(strictExp()) - re.Longest() - return re + strictInit.Do(func() { + strictRe = regexp.MustCompile(strictExp()) + strictRe.Longest() + }) + return strictRe } // Relaxed produces a regexp that matches any URL matched by Strict, plus any // URL with no scheme or email address. func Relaxed() *regexp.Regexp { - re := regexp.MustCompile(relaxedExp()) - re.Longest() - return re + relaxedInit.Do(func() { + relaxedRe = regexp.MustCompile(relaxedExp()) + relaxedRe.Longest() + }) + return relaxedRe } // StrictMatchingScheme produces a regexp similar to Strict, but requiring that // the scheme match the given regular expression. See AnyScheme too. func StrictMatchingScheme(exp string) (*regexp.Regexp, error) { - strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont + strictMatching := `(?i)(?:` + exp + `)(?-i)` + pathCont re, err := regexp.Compile(strictMatching) if err != nil { return nil, err |