package addressparsing import ( "bytes" "cmp" "fmt" "slices" "strings" ) // // one heuristic transforms // ..., " ", ... -> ..., "", "", ... // var homelessCommentFragments struct{} func Pretty(ps PaintedString) string { l0, l1 := new(bytes.Buffer), new(bytes.Buffer) l0.WriteByte('{') l1.WriteByte('{') for j := range len(ps.S) { if j >= len(ps.P) { // unreachable? l1.WriteByte('?') } else { switch ps.P[j] { default: l1.WriteByte('?') case Colourless: l1.WriteByte(' ') case Quote: l1.WriteByte('"') case Whitespace: l1.WriteByte('-') case Comment: l1.WriteByte('(') case ChevronA: l1.WriteByte('a') case ChevronB: l1.WriteByte('b') case Comma: l1.WriteByte(',') case Semicolon: l1.WriteByte(';') case PseudoChevron: l1.WriteByte('c') case Group: l1.WriteByte('g') case Ignore: l1.WriteByte('i') case At: l1.WriteByte('@') } } switch { case ps.S[j] >= ' ' && ps.S[j] != '\\' && ps.S[j] <= '~': l0.WriteByte(ps.S[j]) default: fmt.Fprintf(l0, "\\x%02x", int(ps.S[j])) for range 3 { l1.WriteByte(l1.Bytes()[l1.Len()-1]) } } } l0.WriteByte('}') l1.WriteByte('}') l0.WriteByte('\n') l1.WriteByte('\n') l0.Write(l1.Bytes()) return l0.String() } // modifies ps func Clear(ps PaintedString) { for j := range len(ps.S) { ps.P[j] = Colourless } } // modifies ps func PaintAt(ps PaintedString) { for j := range len(ps.S) { if ps.P[j] == Colourless && ps.S[j] == '@' { ps.P[j] = At } } } func DeleteComments(s string) string { ps := Paint(s) PaintQuote(ps) PaintComment(ps) b := new(strings.Builder) for j := range len(ps.S) { if ps.P[j] == Comment { continue } b.WriteByte(ps.S[j]) } return b.String() } // IsolateAddress tries to find a most likely part of s which represents // an e-mail address, isolates it by potentially removing enclosing chevrons, // and packs the remaining parts into name. 
//
// Both name and address are quote-fixed, comment-depth-fixed and whitespace-trimmed, i.e.:
//	`< a @b .c > (()(x "` -> name=`(()(x ""))`, address=`a @b .c`
//
// Candidates are the maximal runs painted ChevronA, ChevronB or PseudoChevron.
// They are ranked: real chevrons containing an '@' first, then pseudo chevrons
// containing an '@', then real chevrons without, then pseudo chevrons without;
// among equally ranked candidates the right-most one wins. Only '@' bytes left
// Colourless by the quote, whitespace and comment passes count. If there is no
// candidate at all, address is empty and all of s goes into name.
//
// NOTE(jfrech): 2024-06-01: Splitting "address" into {"local", "domain"} isn't pristine.
//
// "specials" are treated as separators, that is
//	",@b" -> (",", "@b")
//	"()@b" -> ("()", "@b")
//	"d c@b" -> ("d", "c@b")
//	"a b . c . d @ e . f g" -> ("a g", "b . c . d @ e . f")
//	"<a b . c . d @ e . f> g" -> ("g", "a b . c . d @ e . f")
//	"a <b . c . d @ e . f g>" -> ("a", "b . c . d @ e . f g")
//	"ab" -> ("ab", "")
//	"ab@" -> ("", "ab@")
// This is done so no subtle discrepancies with [List] arise.
//
// NOTE(review): the chevron-enclosed parts of the fifth and sixth example
// inputs were missing from this comment and have been reconstructed from the
// stated outputs; confirm against the package tests.
//
// Cf. https://www.rfc-editor.org/rfc/rfc822.html#appendix-D [2024-06-05]:
//	atom = 1*<any CHAR except specials, SPACE and CTLs>
//	word = atom / quoted-string
//	specials = "(" / ")" / "<" / ">" / "@" ; Must be in quoted-
//	         / "," / ";" / ":" / "\" / <"> ; string, to use
//	         / "." / "[" / "]"             ; within a word.
func IsolateAddress(s string) (name, address string) {
	/*
	// BUG(jfrech): 2024-06-05: Minimally-invasive quoting is missing: e.g. `le <<@f` should become ("le", `"<"@f") and `le <<@f>` should become ("le", `"<"@f")

	// NOTE(jfrech): 2024-06-12: So that the parsing result is most usable!
	defer func() {
		// BUG(jfrech): 2024-06-12: Should this function itself do this? Would any user of the function benefit from the non-fixed name/address pair?
		name = FixUnquotedMarkers(FixQuotesAndComments(name))
		address = FixUnquotedMarkers(FixQuotesAndComments(address))
	}()
	*/

	// First pass: paint s so that chevron-like regions can be located.
	// NOTE(review): the order of the passes looks significant (PaintAt in
	// this file only touches bytes that are still Colourless; presumably the
	// other painters behave alike) -- confirm before reordering.
	ps := Paint(s)
	PaintQuote(ps)
	PaintWhitespace(ps)
	PaintComment(ps)
	PaintChevron(ps)
	PaintComma(ps)
	PaintSemicolon(ps) // NOTE(jfrech): 2024-06-01: So that ParseAddress and List don't have a subtle discrepancy. Semicolon is only painted to act as a PseudoChevron barrier.
	PaintPseudoChevron(ps)
	//PaintGroup(ps) // not required

	//log.Printf("!\n%s", Pretty(ps))

	// findspan returns the inclusive bounds [l, r] of the maximal run of
	// bytes around index j that carry the same chroma as ps.P[j].
	findspan := func(j int) (l, r int) {
		l = j
		for {
			if l <= 0 || ps.P[l-1] != ps.P[j] {
				break
			}
			l--
		}
		r = j
		for {
			if r >= len(ps.S)-1 || ps.P[r+1] != ps.P[j] {
				break
			}
			r++
		}
		return
	}

	// Span is one candidate region of ps.S: L and R are its byte bounds, At
	// is the index of the right-most counted '@' inside it (-1 if none) and
	// Pseudo records whether it was painted PseudoChevron.
	type Span struct{L, R, At int; Pseudo bool} // inclusive
	var spans []Span
	// Collect every maximal run of each of the three chevron chromas.
	for _, k := range []Chroma{ChevronA, ChevronB, PseudoChevron} {
		for j := 0; j < len(ps.S); j++ {
			if !is(ps.P[j], k) {
				continue
			}
			l, r := findspan(j)
			/*
			if !is(ps.P[j], PseudoChevron) {
				if r-l >= 2 && ps.S[l] == '<' && ps.S[r] == '>' && r-l >= 2 {
					l--
					r++
				}
			}
			*/
			spans = append(spans, Span{
				L: l,
				R: r,
				At: -1, // (later)
				Pseudo: is(k, PseudoChevron),
			})
			// Continue after the run that was just recorded.
			j = r
		}
	}

	// Second pass: repaint from scratch, this time without any chevron
	// painters, so that PaintAt can mark the '@' bytes that the quote,
	// whitespace and comment passes left Colourless.
	Clear(ps)
	PaintQuote(ps)
	PaintWhitespace(ps)
	PaintComment(ps)
	PaintAt(ps)
outer:
	for j := range spans {
		// Already -1 from construction; reset before searching.
		spans[j].At = -1

		// catch the right-most '@'
		for i := spans[j].R; i >= spans[j].L; i-- {
			if ps.P[i] == At {
				spans[j].At = i
				continue outer
			}
		}
	}

	// score1 ranks a span by kind: real chevrons with an '@' beat pseudo
	// chevrons with an '@', which beat real chevrons without, which beat
	// pseudo chevrons without.
	//
	// lower is better
	score1 := func(span Span) int8 {
		switch {
		case !span.Pseudo && span.At != -1:
			return -4
		case span.Pseudo && span.At != -1:
			return -3
		case !span.Pseudo && !(span.At != -1):
			return -2
		case span.Pseudo && !(span.At != -1):
			return -1
		default:
			panic("unreachable")
		}
	}
	// compare2 orders two spans by position so that the one lying further
	// to the right sorts first. The spans must be identical or disjoint and
	// well-formed (L <= R); anything else panics.
	compare2 := func(span, zpan Span) int {
		switch {
		case span == zpan:
			return 0
		case span.L <= span.R && span.R < zpan.L && zpan.L <= zpan.R:
			return +1 // further back is better
		case zpan.L <= zpan.R && zpan.R < span.L && span.L <= span.R:
			return -1 // further forward is worse
		default:
			panic("unreachable")
		}
	}
	// Best candidate ends up in spans[0]. Both arguments of cmp.Or are
	// evaluated before the call, so compare2 (and its panic on overlapping
	// spans) runs even when the scores already differ.
	slices.SortStableFunc(spans, func(span, zpan Span) int {
		return cmp.Or(
			cmp.Compare(score1(span), score1(zpan)),
			compare2(span, zpan),
		)
	})

	/*
	Clear(ps)
	local, domain = "", ""
	if len(spans) > 0 {
		bLocal := new(strings.Builder)
		bDomain := new(strings.Builder)

		span := spans[0]
		spans = nil

		// mark everything (even dropped chevrons) as ``Ignore''
		for i := span.L; i <= span.R; i++ {
			ps.P[i] = Ignore
		}

		// drop chevrons
		if !span.Pseudo && (span.R-span.L+1 >= 2 && ps.S[span.L] == '<' && ps.S[span.R] == '>') {
			span.L++
			span.R--

			// NOTE(jfrech): 2023-10-17: Observed a Microsoft MUA
			//     emitting Message-Ids of the form "<[hex-base32@microsoft.com]>".
			//
			// drop brackets
			if span.R-span.L+1 >= 2 && ps.S[span.L] == '[' && ps.S[span.R] == ']' {
				span.L++
				span.R--
			}
		}

		for i := span.L; i <= span.R; i++ {
			switch {
			case span.At == -1:
				bLocal.WriteByte(ps.S[i])
			case i < span.At:
				bLocal.WriteByte(ps.S[i])
			case i == span.At:
				// ignore
			case i > span.At:
				bDomain.WriteByte(ps.S[i])
			}
		}

		local = TrimWS(Unquote(DeleteComments(bLocal.String())))
		domain = TrimWS(Unquote(DeleteComments(bDomain.String())))
	}
	*/

	// Third pass: start from a blank painting again, mark the winning span
	// as Ignore and cut the address out of it; every byte that is not
	// Ignore afterwards belongs to name.
	Clear(ps)
	address = ""
	if len(spans) > 0 {
		span := spans[0]

		// mark everything (even dropped chevrons) as ``Ignore''
		for i := span.L; i <= span.R; i++ {
			ps.P[i] = Ignore
		}

		// drop chevrons
		// (only for real chevron spans that literally start with '<' and
		// end with '>'; span is a local copy, so spans[0] is unaffected)
		if !span.Pseudo && (span.R-span.L+1 >= 2 && ps.S[span.L] == '<' && ps.S[span.R] == '>') {
			span.L++
			span.R--
		}

		address = TrimWS(FixQuotesAndComments(ps.S[span.L:span.R+1]))
	}

	// Concatenate whatever was not claimed by the address span.
	b := new(strings.Builder)
	for j := range len(ps.S) {
		if ps.P[j] != Ignore {
			b.WriteByte(ps.S[j])
		}
	}
	//name = TrimWS(Unquote(DeleteComments(b.String())))
	// BUG(jfrech): 2024-06-02: Too much post-processing; TrimWS needs to respect open quotes! (`" ` ~> `" `!)
	name = TrimWS(FixQuotesAndComments(b.String()))

	return name, address
}