package tequat

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"mime"
	"mime/multipart"
	"net/textproto"
	"sort"
	"strings"

	"pkg.jfrech.com/brief/decode"
	"pkg.jfrech.com/brief/mailx"
	"pkg.jfrech.com/brief/text"
)

// Sentinel errors reported by the MIME parser; match them with errors.Is.
var (
	// ErrMIMENotYetImplemented marks MIME constructs this package does not
	// handle yet.
	ErrMIMENotYetImplemented = errors.New("MIME not yet implemented")

	// ErrMIMESemantics marks input that violates MIME semantics.
	ErrMIMESemantics = errors.New("MIME semantics")

	// ErrMIMEAlienMediatypeParameters marks media type parameters that are
	// left over after all understood ones have been consumed.
	ErrMIMEAlienMediatypeParameters = errors.New("MIME alien media type parameters")

	// ErrMIMEUnknownMediaType marks media types that are not supported.
	ErrMIMEUnknownMediaType = errors.New("MIME unknown media type")
)

// TODO inelegantly implemented?
// [2023-07-23, jfrech] TODO Inelegantly named!

// ParseRaw reads the header fields of the raw message rawmsgr (fed through
// text.ToCrlfReader) until the splitter reports mailx.EOH.
//
// Fields whose canonical name starts with "Content-" are gathered into a MIME
// header; every other field is returned in order of appearance. The gathered
// header, together with what remains of the splitter, is handed to Parse.
//
// A splitter error other than mailx.EOH yields (nil, nil, err). An error from
// Parse is returned alongside the non-content fields and whatever Parse
// returned.
func ParseRaw(rawmsgr io.Reader) ([]mailx.Field, TQA, error) {
	var others []mailx.Field
	content := make(textproto.MIMEHeader)
	contentprefix := textproto.CanonicalMIMEHeaderKey("Content-")

	src := mailx.NewSplitter(text.ToCrlfReader(rawmsgr))
	for {
		field, err := src.NextField()
		if errors.Is(err, mailx.EOH) {
			break
		}
		if err != nil {
			return nil, nil, err
		}

		// TODO is this accurate?
		if !strings.HasPrefix(field.CanonicalName(), contentprefix) {
			others = append(others, field)
			continue
		}
		content.Add(field.Name, field.Body)
	}

	// Once the header is exhausted, the splitter itself serves as the body
	// reader.
	parsed, err := Parse(content, src)
	return others, parsed, err
}

// Parse runs the parser over r, steered by a copy of header obtained from
// mailx.CopyMIMEHeader, and returns the result after calling Normalised on it.
// The error, if any, is the one reported by the parser.
func Parse(header textproto.MIMEHeader, r io.Reader) (TQA, error) {
	headercopy := mailx.CopyMIMEHeader(header)
	parsed, err := parse(parsecontext{}, headercopy, r)
	return parsed.Normalised(), err
}

// parsecontext has value semantics and all its methods are pure.
type parsecontext struct { currentlyinline bool } func (pc parsecontext) notinline() parsecontext { pc.currentlyinline = false return pc } func (pc parsecontext) inline() parsecontext { pc.currentlyinline = true return pc } func parse(pc parsecontext, header textproto.MIMEHeader, r io.Reader) (TQA, error) { contenttype := header.Get("Content-Type") if len(header.Values("Content-Type")) > 1 { return nil, fmt.Errorf("%w: multiple Content-Type: %v", ErrMIMESemantics, header.Values("Content-Type")) } header.Del("Content-Type") if len(header.Values("Content-Transfer-Encoding")) > 1 { return nil, fmt.Errorf("%w: multiple Content-Transfer-Encoding: %v", ErrMIMESemantics, header.Values("Content-Transfer-Encoding")) } else { var err error r, err = decode.Transfer(r, header.Get("Content-Transfer-Encoding")) if err != nil { return nil, err } } header.Del("Content-Transfer-Encoding") // TODO Does mime.ParseMediaType error out on multiple values? t, params, err := mime.ParseMediaType(contenttype) if err != nil { return nil, fmt.Errorf("%w: %v", err, ErrMIMESemantics) } if strings.HasPrefix(t, "multipart/") || t == "multipart" { return parseMultipart(pc, t, params, header, r) } if strings.HasPrefix(t, "text/") || t == "text" { return parseText(pc, t, params, header, r) } switch t { case "image/jpeg", "image/png", "image/gif", "application/pdf", "application/octet-stream": data, err := io.ReadAll(r) if err != nil { return nil, err } header.Set("Content-Type", mime.FormatMediaType(t, params)) return []Atom{Attachment{ Inline: pc.currentlyinline, Header: header, Data: data, }}, nil // https://www.rfc-editor.org/rfc/rfc2046.html#section-5.1.7 default: return nil, fmt.Errorf("%w: %q", ErrMIMEUnknownMediaType, mime.FormatMediaType(t, params)) } } func parseText(pc parsecontext, t string, params map[string]string, header textproto.MIMEHeader, r io.Reader) (TQA, error) { if !strings.HasPrefix(t, "text/") && t != "text" { return nil, fmt.Errorf("not text/*: %q", t) } switch t { default: 
return nil, fmt.Errorf("%w: text/*: %q", ErrMIMEUnknownMediaType, mime.FormatMediaType(t, params)) case "text/plain": var err error r, err = decode.Charset(params["charset"], r) if err != nil { return nil, err } delete(params, "charset") r = decode.Flow(params["format"], params["delsp"], r) delete(params, "format") delete(params, "delsp") if len(params) != 0 { return nil, fmt.Errorf("%w: %q", ErrMIMEAlienMediatypeParameters, mime.FormatMediaType(t, params)) } return parseTextPlain(r) // TODO think about normalisation: text.Crlf2LfReader(r)) case "text/html": var err error r, err = decode.Charset(params["charset"], r) if err != nil { return nil, err } delete(params, "charset") if len(params) != 0 { return nil, fmt.Errorf("%w: %q", ErrMIMEAlienMediatypeParameters, mime.FormatMediaType(t, params)) } return parseTextHTML(r) } } func parseMultipart(pc parsecontext, t string, params map[string]string, header textproto.MIMEHeader, r io.Reader) (TQA, error) { if !strings.HasPrefix(t, "multipart/") && t != "multipart" { return nil, fmt.Errorf("not multipart/*: %q", t) } // modifies params multipartreader := func() (*multipart.Reader, error) { boundary := params["boundary"] if boundary == "" { return nil, fmt.Errorf("%w: %s: no boundary=", ErrMIMESemantics, t) } delete(params, "boundary") if len(params) != 0 { return nil, fmt.Errorf("%w: %q", ErrMIMEAlienMediatypeParameters, mime.FormatMediaType(t, params)) } return multipart.NewReader(r, boundary), nil } forallparts := func(f func(int, *multipart.Part) error) error { mpw, err := multipartreader() if err != nil { return err // TODO error prefix? 
} for pidx := 0; true; pidx++ { p, err := mpw.NextPart() if errors.Is(err, io.EOF) { return nil } else if err != nil { return fmt.Errorf("%s[%d]: %w", t, pidx, err) } err = f(pidx, p) if err != nil { return fmt.Errorf("%s[%d]: %w", t, pidx, err) } err = p.Close() if err != nil { return fmt.Errorf("%s[%d]: %w", t, pidx, err) } } panic("unreachable") // TODO remove this line } // RFC 1341, 7.2.2 // RFC 2046, 5.1.3. //case "multipart/mixed": // RFC 1341, 7.2.3 // https://www.rfc-editor.org/rfc/rfc2046.html#section-5.1.4 switch t { // "Any 'multipart' subtypes that an implementation does not recognize | must be treated as being of subtype 'mixed'." [RFC 2046, 5.1.3. [quotation marks altered]] default: return nil, fmt.Errorf("%w: unknown media type: %q", ErrMIMENotYetImplemented, mime.FormatMediaType(t, params)) // TODO remove fallthrough case "multipart/mixed": var atoms []Atom return TQA(atoms), forallparts(func(pidx int, p *multipart.Part) error { sub, err := parse(pc, p.Header, p) if err != nil { return err } atoms = append(atoms, sub...) return nil }) // https://www.rfc-editor.org/rfc/rfc2387 case "multipart/related": // https://www.rfc-editor.org/rfc/rfc2387#section-3.1 rootmediatype := params["type"] delete(params, "type") // TODO if _, ok := params["start"]; ok { return nil, fmt.Errorf("%w: %s; start=", ErrMIMENotYetImplemented, t) } else if _, ok := params["start-info"]; ok { return nil, fmt.Errorf("%w: %s; start-info=", ErrMIMENotYetImplemented, t) } delete(params, "start") delete(params, "start-info") var atoms []Atom return TQA(atoms), forallparts(func(pidx int, p *multipart.Part) error { isroot := pidx == 0 // TODO // TODO if p.Header.Get("Content-Type") == "" && isroot { p.Header.Set("Content-Type", rootmediatype) } sub, err := parse(pc, p.Header, p) if err != nil { return err } atoms = append(atoms, sub...) 
return nil }) case "multipart/alternative": // [2023-02-16, jfrech] This is a hotfix since many of my spam mails // for set a character set for multipart/alternative (which does not // make a lot of sense). if strings.ToLower(params["charset"]) == "utf-8" { delete(params, "charset") } type alternative struct { header textproto.MIMEHeader body []byte } var alternatives []alternative err := forallparts(func(_ int, p *multipart.Part) error { body, err := io.ReadAll(p) // TODO A tad inefficient, but p has no longevity (cf. https://cs.opensource.google/go/go/+/refs/tags/go1.20.1:src/mime/multipart/multipart.go;l=304;drc=b146d7626f869901f9dd841b9253e89a227c6465). alternatives = append(alternatives, alternative{mailx.CopyMIMEHeader(p.Header), body}) return err }) if err != nil { return nil, err } // alternatives[len(alternatives)-1] will be chosen sort.SliceStable(alternatives, func(i, j int) bool { key := map[string]int{ "text/html": 1, "text/plain": 2, } ai, aj := alternatives[i], alternatives[j] ti, _, _ := mime.ParseMediaType(ai.header.Get("Content-Type")) tj, _, _ := mime.ParseMediaType(aj.header.Get("Content-Type")) return key[ti] < key[tj] }) // [2023-02-15, jfrech] Perform a sanity check: some e-mails contain an // empty text/plain "alternative", whilst hiding the message in a // text/html part. for len(alternatives) > 1 && len(bytes.TrimSpace(alternatives[len(alternatives)-1].body)) == 0 { alternatives = alternatives[:len(alternatives)-1] } if len(alternatives) == 0 { /* header := make(textproto.MIMEHeader) header.Set("Content-Type", "text/plain; charset=utf-8") alternatives = append(alternatives, alternative{header, nil}) */ return nil, fmt.Errorf("%w: %s: empty", ErrMIMESemantics, t) } // "In general, the | best choice is the LAST part [...]" [RFC 2046, 5.1.4.] 
choice := alternatives[len(alternatives)-1] atoms, err := parse(pc.notinline(), choice.header, bytes.NewReader(choice.body)) if droppeds := alternatives[:len(alternatives)-1]; len(droppeds) > 0 { var droppedalternatives DroppedAlternatives for _, dropped := range droppeds { t, _, _ := mime.ParseMediaType(dropped.header.Get("Content-Type")) droppedalternatives.MediatypeTs = append(droppedalternatives.MediatypeTs, t) } atoms = append(atoms, droppedalternatives) } return atoms, err // RFC 1341, 7.2.4 //case "multipart/digest": // RFC 1341, 7.2.5 //case "multipart/parallel": } }