package mailx import ( "bytes" "fmt" "regexp" "strings" "unicode/utf8" "pkg.jfrech.com/brief/internal/crumbs" "pkg.jfrech.com/brief/text" ) // Encode encodes a Field in a way to maximise both decodability of the exact // (including neighbouring whitespace) Field.Body by most mail agents as well // as maximise MUA passthrough. // // For a valid fieldname (nonempty, consisting of c>' '&&c!=':'&&c<='~') and // a valid fieldbody (properly UTF-8-encoded), Encode is injective. // // Encoded field may be folded. Encoded field includes a trailing CRLF. // // The empty fieldname is encoded as "-". Invalid fieldname characters are // replaced by '-'. Invalid fieldbody bytes are encoded as utf8.RuneError // (there is no non-character-semantics IANA charset so encoding true data in // mail field bodies is not possible in any context). // // Overlong atoms (> 76 octets) or overlong lines (> 80 octets, including CRLF) // may get emitted for whitespace-enclosed (counting left and right ends of // string as implicit whitespace) strings of the form /<[!-~]*@[!-~]*>/ to aid // in non-q-atom-compliant mail transfer agent's understanding of the message. // // Guaranteed to not emit lines longer than 1000 bytes (including CRLF) if // len(field.Name) <= 972 (equivalently, len(field.Name) + len(": ") + // + len("=?UTF-8?Q?=FF=FF=FF=FF?=") + len("\r\n") <= 1000). // // NOTE: When decoding mixed qatom/atom streams, whitespace is discarded // iff it is present between two qatoms. // [2023-11-13, jfrech] NOTE: An empty Q-encoded atom (e.g. "=?UTF-8?Q??=") // is invalid by RFC specifications. // [2023-11-22, jfrech] NOTE: Whitespace is kept between two atoms iff they // are not both q-atoms: // decode("a b") = "a b", // decode("a =?UTF-8?Q?b?=") = "a b", // decode("=?UTF-8?Q?a?= b") = "a b", // decode("=?UTF-8?Q?a?= =?UTF-8?Q?b?=") = "ab" func (field Field) Encode() []byte { // TODO maybe fusing is not wise: keeping atoms apart may aid in semantics-sensitive contexts const fuseQatoms = true if field.Body == "" { return []byte(field.fixedName() + ":" + CRLF) } isPlainAscii := func(s string) bool { if strings.Contains(s, "=?") || strings.Contains(s, "?=") { return false } else if len(s) == 0 || (len(s) > 0 && (text.IsHTSP(s[0]) || text.IsHTSP(s[len(s)-1]))) { return false } for _, c := range []byte(s) { if c == '\t' || c == ' ' { continue } if c < ' ' || c > '~' { return false } } return true } // looks like an e-mail address, if possible do not q-encode just to not // hit 80 char max encodingDiscouraged := func(text string) bool { return isPlainAscii(text) && regexp.MustCompile(`<.*@.*>`).MatchString(text) } hasWhitespacePrefix := func(s string) bool { return len(s) > 0 && text.IsHTSP(s[0]) } hasWhitespaceSuffix := func(s string) bool { return len(s) > 0 && text.IsHTSP(s[len(s)-1]) } forceValidUTF8 := func(text string) string { b := new(strings.Builder) for _, r := range text { // [2023-11-13, jfrech] NOTE: We can here even allow r=='\r' or // r=='\n' as they will correctly be q-encoded. // Thus, even r=='\x00' is permitted here. b.WriteRune(r) } return b.String() } // qesc encodes a single rune as qatom innards qesc := func(r rune) []byte { switch { // [2023-11-12, jfrech] Encoding ' ' as '_' is no longer // deemed too obscure. case r == ' ': return []byte{'_'} // only bytteratim-encode i) ASCII characters which ii) have no // qatom-special meaning and iii) do not confuse primitive e-mail // search queries case (r > ' ' && r <= '~') && (r != '=' && r != '?' && r != '_') && (r != '<' && r != '@' && r != '>'): return []byte{byte(r)} default: buf := new(bytes.Buffer) // TODO make global? utf8buf := make([]byte, 4) // TODO make global? for _, c := range utf8buf[:utf8.EncodeRune(utf8buf, r)] { fmt.Fprintf(buf, "=%02X", int(c)) } return buf.Bytes() } } type Part struct { Text string NeedsEncoding bool } var parts []Part for fb := forceValidUTF8(field.Body); len(fb) > 0; { switch { case text.IsHTSP(fb[0]): k := strings.IndexFunc(fb, func(r rune) bool { return !strings.ContainsRune(text.HTSP, r) }) if k == -1 { k = len(fb) } part := fb[:k] fb = fb[k:] // At least three consecutive whitespaces allow for a qatom // surrounded by whitespace: // "x y" becomes "x =?UTF-8?Q?_?= y" if len(part) >= 3 { parts = append(parts, Part{Text: string(part[0])}) parts = append(parts, Part{Text: part[1:len(part)-1], NeedsEncoding: true}) parts = append(parts, Part{Text: string(part[len(part)-1])}) } else { parts = append(parts, Part{Text: part}) } default: k := strings.IndexAny(fb, text.HTSP) if k == -1 { k = len(fb) } part := fb[:k] fb = fb[k:] parts = append(parts, Part{ Text: part, NeedsEncoding: !isPlainAscii(part), }) } } // leading whitespace if len(parts) > 0 && hasWhitespacePrefix(parts[0].Text) { parts[0].NeedsEncoding = true if len(parts) > 1 && len(parts[0].Text) > 1 { c := parts[0].Text[len(parts[0].Text)-1] parts[0].Text = parts[0].Text[:len(parts[0].Text)-1] parts[1].Text = string(c) + parts[1].Text } } // trailing whitespace if len(parts) > 0 && hasWhitespaceSuffix(parts[len(parts)-1].Text) { parts[len(parts)-1].NeedsEncoding = true if len(parts) > 1 && len(parts[len(parts)-1].Text) > 1 { c := parts[len(parts)-1].Text[0] parts[len(parts)-1].Text = parts[len(parts)-1].Text[1:] parts[len(parts)-2].Text = parts[len(parts)-2].Text + string(c) } } for j := range parts { if parts[j].Text == "" { panic("unreachable: empty part") } } // avoid Part{" ",false} (exactly two consecutive whitespace) for j := 1; j < len(parts)-1; j++ { if !parts[j].NeedsEncoding && len(parts[j].Text) == 2 && crumbs.All(text.IsHTSP, []byte(parts[j].Text)) { switch { case parts[j-1].NeedsEncoding: parts[j-1].Text = parts[j-1].Text + string(parts[j].Text[0]) parts[j].Text = parts[j].Text[1:] case parts[j+1].NeedsEncoding: parts[j+1].Text = string(parts[j].Text[1]) + parts[j+1].Text parts[j].Text = parts[j].Text[:len(parts[j].Text)-1] // " " should never occur, but the // behaviour is modeled after "Wukk ", where the right // part is kept pristine. default: fallthrough case encodingDiscouraged(parts[j+1].Text): parts[j-1].Text = parts[j-1].Text + string(parts[j].Text[0]) parts[j-1].NeedsEncoding = true parts[j].Text = parts[j].Text[1:] case encodingDiscouraged(parts[j-1].Text): parts[j+1].Text = string(parts[j].Text[1]) + parts[j+1].Text parts[j+1].NeedsEncoding = true parts[j].Text = parts[j].Text[:len(parts[j].Text)-1] } } } // force encoding for j := 1; j < len(parts); j++ { for i := 0; i < 2; i++ { // TODO ugly if parts[j-1].NeedsEncoding && !hasWhitespacePrefix(parts[j].Text) { parts[j].NeedsEncoding = true } if !hasWhitespaceSuffix(parts[j-1].Text) && parts[j].NeedsEncoding { parts[j-1].NeedsEncoding = true } } } // limits if true { if len(parts) > 0 && len(field.fixedName()) + len(": ") + len(parts[0].Text) + len(CRLF) > MaxLineOverlength { parts[0].NeedsEncoding = true } for j := range parts { if len(parts[j].Text) > MaxAtomLength && !encodingDiscouraged(parts[j].Text) { parts[j].NeedsEncoding = true } if len(" ") + len(parts[j].Text) + len(CRLF) > MaxLineOverlength { parts[j].NeedsEncoding = true } } } // save squeezed-in full whitespace // (e.g. "...?= =?..." loses its whitespace whilst e.g. "...?= f =?..." // keeps both left and right) for j := 1; j < len(parts)-1; j++ { if parts[j-1].NeedsEncoding && crumbs.All(text.IsHTSP, []byte(parts[j].Text)) && parts[j+1].NeedsEncoding { parts[j].NeedsEncoding = true } } if fuseQatoms { // fuse .NeedsEncoding for j := 1; j < len(parts); j++ { if parts[j-1].NeedsEncoding && parts[j].NeedsEncoding { parts[j-1].Text += parts[j].Text parts[j] = Part{} parts = append(parts[:j], parts[j+1:]...) j-- } } } for j := range parts { if parts[j].Text == "" { panic("unreachable: empty part") } } buf := new(bytes.Buffer) line := new(bytes.Buffer) line.Grow(MaxLineLength+1) line.WriteString(field.fixedName()) line.WriteString(": ") fold := func() { if line.Len() < 1 || !text.IsHTSP(line.Bytes()[line.Len()-1]) { panic("unreachable") } if buf.Len() == 0 && line.Len() == len(field.fixedName()) + len(": ") { return } c := line.Bytes()[line.Len()-1] line.Truncate(line.Len()-1) if line.Len() > 0 { line.WriteString(CRLF) buf.Write(line.Bytes()) } line.Reset() line.WriteByte(c) } for _, part := range parts { switch { default: atominnards := new(bytes.Buffer) for txt := part.Text; len(txt) > 0; { if bytes.HasSuffix(line.Bytes(), []byte("?=")) { line.WriteString(" ") } if line.Len() < 1 || !text.IsHTSP(line.Bytes()[line.Len()-1]) { panic("unreachable") } r0, _ := utf8.DecodeRuneInString(txt) if line.Len() + (len("=?UTF-8?Q?") + len(qesc(r0)) + len("?=")) + len(CRLF) > MaxLineLength { fold() } atominnards.Reset() for len(txt) > 0 { r, n := utf8.DecodeRuneInString(txt) if atominnards.Len() > 0 && line.Len() + (len("=?UTF-8?Q?") + atominnards.Len() + len(qesc(r)) + len("?=")) + len(CRLF) > MaxLineLength { break } txt = txt[n:] // NOTE: Even if always bytes.Equal(qesc(r),[]byte(byte(r))) holds, qencoding is still necessary to not invent whitespace around an atom. atominnards.Write(qesc(r)) } line.WriteString("=?UTF-8?Q?") line.Write(atominnards.Bytes()) atominnards.Reset() line.WriteString("?=") // only the very first line may be overlong if buf.Len() > 0 && line.Len() + len(CRLF) > MaxLineLength { panic("unreachable") } } case !part.NeedsEncoding && (len(part.Text) <= MaxLineLength-len(" ")-len(CRLF) || (encodingDiscouraged(part.Text) && len(part.Text) <= MaxLineOverlength-len(" ")-len(CRLF))): if line.Len() + len(part.Text) + len(CRLF) <= MaxLineLength { line.WriteString(part.Text) break } txt := part.Text if len(txt) <= 0 || line.Len() <= 0 { panic("unreachable") } if text.IsHTSP(txt[0]) { if text.IsHTSP(line.Bytes()[line.Len()-1]) { panic("unreachable") } line.WriteByte(txt[0]) txt = txt[1:] } fold() line.WriteString(txt) } } if line.Len() > 0 { line.WriteString(CRLF) buf.Write(line.Bytes()) line.Reset() } return buf.Bytes() }