jhillyerd / enmime

MIME mail encoding and decoding package for Go

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Feature: Parse headers and stop

jhillyerd opened this issue · comments

Inbucket needs to read message headers during delivery, but doesn't need to parse the entire message until somebody tries to view it.

Go's built-in header parsing doesn't handle encoded headers. It would be nice if enmime could just parse the headers and return them.

commented

@jhillyerd what type of encoding? Do you have a requirement to decode RFC2047 QP and B64 header values to plain-text?

commented

I wrote this a while back to preserve the order of the headers with decoded values. However, the resulting Decoded []byte value is for humans, i.e., it's Unicode and doesn't respect folding-whitespace or line-length requirements.

type (
	// Header is a single header field: its name and its value.
	Header struct {
		Name  string
		Value string
	}

	// Headers is an ordered list of header fields.
	Headers []Header

	// HeadersPreserved bundles an ordered header list with a rendered
	// byte representation of those same headers.
	HeadersPreserved struct {
		Decoded []byte
		Headers Headers
	}
)

// Sort parses the header section of the raw message b and returns its header
// fields in the order they appear, with each value passed through
// RFC2047parts.
//
// The input is first run through Clean and parsed with textproto's
// ReadMIMEHeader, which yields unfolded values keyed by canonical name. A
// second, line-by-line pass over the same bytes recovers the original
// ordering: for every line that starts a header field, the next pending value
// for that key is taken from the parsed map.
//
// The returned HeadersPreserved holds the ordered fields in Headers and, in
// Decoded, the same fields rendered as "Name: Value\r\n" lines followed by a
// terminating "\r\n".
//
// An error is returned if ReadMIMEHeader fails with anything other than
// io.EOF, or if the line scanner fails (for example on an over-long line).
func Sort(b []byte) (*HeadersPreserved, error) {
	b = Clean(b)
	tr := textproto.NewReader(bufio.NewReader(bytes.NewReader(b)))
	headers, err := tr.ReadMIMEHeader()
	switch errors.Cause(err) {
	case nil, io.EOF:
		// carry on, io.EOF is expected
	default:
		return nil, err
	}

	bs := bufio.NewScanner(bytes.NewReader(b))
	res := Headers{}
	bw := &bytes.Buffer{}
	for bs.Scan() {
		line := bs.Text()
		if line == "" {
			// A blank line ends the header section, which is also where
			// ReadMIMEHeader stopped. Scanning on into the body could match
			// body text that merely looks like a header line.
			break
		}
		if strings.HasPrefix(line, " ") || strings.HasPrefix(line, "\t") {
			// Continuation of a folded header; its content is already part
			// of the unfolded value returned by ReadMIMEHeader.
			continue
		}
		i := strings.Index(line, ":")
		if i == -1 {
			continue
		}
		header := textproto.CanonicalMIMEHeaderKey(line[:i])
		values := headers[header]
		if len(values) == 0 {
			// Something is off: every value for this key has already been
			// consumed (or the parser never recorded one). Skip the line
			// rather than index into an empty list.
			continue
		}
		// Take the oldest pending value for this key and drop it from the
		// map, so repeated headers are emitted in their original order.
		firstValue := values[0]
		headers[header] = values[1:]

		h := Header{Name: header, Value: RFC2047parts(firstValue)}
		res = append(res, h)
		fmt.Fprintf(bw, "%s: %s\r\n", h.Name, h.Value)
	}
	if err := bs.Err(); err != nil {
		// Do not hand back a silently truncated header list.
		return nil, err
	}
	bw.WriteString("\r\n")

	return &HeadersPreserved{
		Decoded: bw.Bytes(),
		Headers: res,
	}, nil
}

// Clean returns a copy of b in which the leading header section is guaranteed
// to be separated from whatever follows it by an empty CRLF line.
//
// The input is split after every "\r\n". Leading pieces that contain a ':' or
// begin with a space or tab are treated as header lines and copied as-is. The
// first piece that does not qualify ends the header section; unless that
// piece is itself a bare "\r\n", a "\r\n" is inserted in front of it. All
// remaining pieces are copied unchanged.
func Clean(b []byte) []byte {
	crlf := []byte("\r\n")
	out := make([]byte, 0, len(b)+2)
	inHeaders := true

	for _, piece := range bytes.SplitAfter(b, crlf) {
		if inHeaders {
			startsWithWS := len(piece) > 0 && (piece[0] == ' ' || piece[0] == '\t')
			if startsWithWS || bytes.IndexByte(piece, ':') >= 0 {
				out = append(out, piece...)
				continue
			}
			// First non-header piece: make sure a blank line precedes it.
			inHeaders = false
			if !bytes.Equal(piece, crlf) {
				out = append(out, crlf...)
			}
		}
		out = append(out, piece...)
	}

	return out
}

// RFC2047parts returns s with RFC 2047 encoded content decoded.
//
// Carriage returns and line feeds in s are first replaced by spaces. The
// result is then fed to rfc2047recurse repeatedly for as long as it reports
// success (a nil error); the value from the first call that returns a non-nil
// error is the final result.
//
// RFC2047 Example:
//     `=?UTF-8?B?bmFtZT0iw7DCn8KUwoo=?=`
func RFC2047parts(s string) string {
	flatten := func(r rune) rune {
		switch r {
		case '\r', '\n':
			return ' '
		}
		return r
	}
	s = strings.Map(flatten, s)

	for {
		decoded, err := rfc2047recurse(s)
		s = decoded
		if err != nil {
			// Any error, including the io.EOF "nothing left to decode"
			// signal, ends the loop.
			return s
		}
	}
}

// rfc2047recurse performs one decoding pass over s.
//
// It returns io.EOF together with the unchanged input when s contains neither
// "?Q?" nor "?B?" (case-insensitively). Otherwise s is decoded with
// decodeHeader. If that leaves the string unchanged, decoding is retried on
// the output of fixRFC2047String; io.EOF is returned when the retry also
// yields the original s. A nil error means the returned string differs from s
// and the caller may try another pass.
//
// RFC2047 Example:
//     `=?UTF-8?B?bmFtZT0iw7DCn8KUwoo=?=`
func rfc2047recurse(s string) (string, error) {
	upper := strings.ToUpper(s)
	hasQ := strings.Contains(upper, "?Q?")
	hasB := strings.Contains(upper, "?B?")
	if !hasQ && !hasB {
		return s, io.EOF
	}

	decoded, err := decodeHeader(s)
	if err != nil {
		return decoded, err
	}
	if decoded != s {
		return decoded, nil
	}

	// The first attempt changed nothing: repair the string and try once more.
	decoded, err = decodeHeader(fixRFC2047String(decoded))
	if err != nil {
		return decoded, err
	}
	if decoded == s {
		return decoded, io.EOF
	}
	return decoded, nil
}

// decodeHeader decodes the RFC 2047 encoded words in input using a
// mime.WordDecoder whose CharsetReader is NewCharsetReader. On failure the
// original input is returned alongside the error.
func decodeHeader(input string) (string, error) {
	dec := mime.WordDecoder{CharsetReader: NewCharsetReader}
	decoded, err := dec.DecodeHeader(input)
	if err == nil {
		return decoded, nil
	}
	return input, err
}

// fixRFC2047String removes stray whitespace (space, CR, LF) from inside
// RFC 2047 encoded words in s, leaving all text outside encoded words
// untouched.
//
// An encoded word has the form "=?charset?encoding?text?=". The scanner
// enters a word on "=?" and counts the '?' separators seen since that opener;
// the word ends at the first "?=" whose '?' is at least the third separator.
// Tracking the count per word (rather than across the whole string) means:
//   - every encoded word in s is repaired, not only the first one;
//   - base64 padding directly before the terminator ("...=?=") is not
//     mistaken for a new opener, so whitespace after the word is preserved;
//   - a '?' appearing in plain text before an encoded word does not throw
//     off the separator count.
func fixRFC2047String(s string) string {
	var sb strings.Builder
	sb.Grow(len(s))

	inWord := false // currently between "=?" and the closing "?="
	delims := 0     // '?' separators seen since the opening "=?"
	var prev rune   // previous input rune (0 at start and right after a close)

	for _, r := range s {
		switch {
		case !inWord:
			if prev == '=' && r == '?' {
				inWord = true
				delims = 0
			}
			sb.WriteRune(r)
		case r == '?':
			delims++
			sb.WriteRune(r)
		case r == '=' && prev == '?' && delims >= 3:
			// Closing "?=": leave the word. prev is cleared so that this '='
			// cannot double as the first half of a new "=?" opener.
			inWord = false
			sb.WriteRune(r)
			prev = 0
			continue
		case r == '\n' || r == '\r' || r == ' ':
			// Whitespace is not allowed inside an encoded word; drop it.
		default:
			sb.WriteRune(r)
		}
		prev = r
	}
	return sb.String()
}

For the NewCharsetReader, just use the one in the enmime internal pkg...

Yes, essentially human readable decoding. All I really want is for enmime to do the exact same decoding it does now when building an Envelope, but then to stop before it starts trying to process the body of the email.

commented

@jhillyerd just let me know where you envision this being implemented and any special rules or strictures for output formatting. Got some free cycles for a week or so.

So the exact problem I'm trying to solve is here:

https://github.com/inbucket/inbucket/blob/master/pkg/message/manager.go#L53

I parse an entire email with enmime, but all I care about in that scenario is the From, To and Subject in UTF-8 from the primary header.

Returning the Envelope struct isn't mandatory. Let me know if that clarifies things.