-
-
Notifications
You must be signed in to change notification settings - Fork 34.1k
gh-144156: Fix email header folding concatenating encoded words #144692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
b5925e0
1a7824e
fb23a6e
050491c
ed6f197
91b4b3b
d5f5001
15e6618
902ec59
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -80,7 +80,8 @@ | |||||||||||||
| # Useful constants and functions | ||||||||||||||
| # | ||||||||||||||
|
|
||||||||||||||
| WSP = set(' \t') | ||||||||||||||
| _WSP = ' \t' | ||||||||||||||
| WSP = set(_WSP) | ||||||||||||||
| CFWS_LEADER = WSP | set('(') | ||||||||||||||
| SPECIALS = set(r'()<>@,:;.\"[]') | ||||||||||||||
| ATOM_ENDS = SPECIALS | WSP | ||||||||||||||
|
|
@@ -2835,6 +2836,35 @@ def _steal_trailing_WSP_if_exists(lines): | |||||||||||||
| lines.pop() | ||||||||||||||
| return wsp | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| def _steal_all_trailing_WSP_if_exists(lines): | ||||||||||||||
| lines_popped = False | ||||||||||||||
| wsp_lines = [] | ||||||||||||||
| while lines and lines[-1]: | ||||||||||||||
| for i in range(len(lines[-1]), -1, -1): | ||||||||||||||
| if i <= 0 or lines[-1][i - 1] not in WSP: | ||||||||||||||
| break | ||||||||||||||
| wsp_line = lines[-1][i:] | ||||||||||||||
| if not wsp_line: | ||||||||||||||
| break | ||||||||||||||
| wsp_lines.insert(0, wsp_line) | ||||||||||||||
| lines[-1] = lines[-1][:i] | ||||||||||||||
| if not lines[-1]: | ||||||||||||||
| lines_popped = True | ||||||||||||||
| lines.pop() | ||||||||||||||
| else: | ||||||||||||||
| break | ||||||||||||||
| if lines_popped: | ||||||||||||||
| lines.append(' ' if lines else '') | ||||||||||||||
| return ''.join(wsp_lines) | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this is necessary, see my suggestion below. |
||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| def _last_word_is_sill_ew(_last_word_is_ew, added_str): | ||||||||||||||
| # If the last word is an encoded word, and the added string is all WSP, | ||||||||||||||
| # then (and only then) is the last word is still an encoded word. | ||||||||||||||
| return _last_word_is_ew and not bool(added_str.strip(_WSP)) | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| def _refold_parse_tree(parse_tree, *, policy): | ||||||||||||||
| """Return string of contents of parse_tree folded according to RFC rules. | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -2843,11 +2873,9 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| maxlen = policy.max_line_length or sys.maxsize | ||||||||||||||
| encoding = 'utf-8' if policy.utf8 else 'us-ascii' | ||||||||||||||
| lines = [''] # Folded lines to be output | ||||||||||||||
| leading_whitespace = '' # When we have whitespace between two encoded | ||||||||||||||
| # words, we may need to encode the whitespace | ||||||||||||||
| # at the beginning of the second word. | ||||||||||||||
| last_ew = None # Points to the last encoded character if there's an ew on | ||||||||||||||
| # the line | ||||||||||||||
| last_word_is_ew = False | ||||||||||||||
| last_ew = None # if there is an encoded word in the last line of lines, | ||||||||||||||
| # points to the encoded word's first character | ||||||||||||||
| last_charset = None | ||||||||||||||
| wrap_as_ew_blocked = 0 | ||||||||||||||
| want_encoding = False # This is set to True if we need to encode this part | ||||||||||||||
|
|
@@ -2882,6 +2910,7 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| if part.token_type == 'mime-parameters': | ||||||||||||||
| # Mime parameter folding (using RFC2231) is extra special. | ||||||||||||||
| _fold_mime_parameters(part, lines, maxlen, encoding) | ||||||||||||||
| last_word_is_ew = False | ||||||||||||||
| continue | ||||||||||||||
|
|
||||||||||||||
| if want_encoding and not wrap_as_ew_blocked: | ||||||||||||||
|
|
@@ -2898,6 +2927,7 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| # XXX what if encoded_part has no leading FWS? | ||||||||||||||
| lines.append(newline) | ||||||||||||||
| lines[-1] += encoded_part | ||||||||||||||
| last_word_is_ew = False | ||||||||||||||
| continue | ||||||||||||||
| # Either this is not a major syntactic break, so we don't | ||||||||||||||
| # want it on a line by itself even if it fits, or it | ||||||||||||||
|
|
@@ -2916,11 +2946,16 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| (last_charset == 'unknown-8bit' or | ||||||||||||||
| last_charset == 'utf-8' and charset != 'us-ascii')): | ||||||||||||||
| last_ew = None | ||||||||||||||
| last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, | ||||||||||||||
| part.ew_combine_allowed, charset, leading_whitespace) | ||||||||||||||
| # This whitespace has been added to the lines in _fold_as_ew() | ||||||||||||||
| # so clear it now. | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
| last_ew = _fold_as_ew( | ||||||||||||||
| tstr, | ||||||||||||||
| lines, | ||||||||||||||
| maxlen, | ||||||||||||||
| last_ew, | ||||||||||||||
| part.ew_combine_allowed, | ||||||||||||||
| charset, | ||||||||||||||
| last_word_is_ew, | ||||||||||||||
| ) | ||||||||||||||
| last_word_is_ew = True | ||||||||||||||
| last_charset = charset | ||||||||||||||
| want_encoding = False | ||||||||||||||
| continue | ||||||||||||||
|
|
@@ -2933,28 +2968,20 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
|
|
||||||||||||||
| if len(tstr) <= maxlen - len(lines[-1]): | ||||||||||||||
| lines[-1] += tstr | ||||||||||||||
| last_word_is_ew = _last_word_is_sill_ew(last_word_is_ew, tstr) | ||||||||||||||
| continue | ||||||||||||||
|
|
||||||||||||||
| # This part is too long to fit. The RFC wants us to break at | ||||||||||||||
| # "major syntactic breaks", so unless we don't consider this | ||||||||||||||
| # to be one, check if it will fit on the next line by itself. | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
| if (part.syntactic_break and | ||||||||||||||
| len(tstr) + 1 <= maxlen): | ||||||||||||||
| newline = _steal_trailing_WSP_if_exists(lines) | ||||||||||||||
| if newline or part.startswith_fws(): | ||||||||||||||
| # We're going to fold the data onto a new line here. Due to | ||||||||||||||
| # the way encoded strings handle continuation lines, we need to | ||||||||||||||
| # be prepared to encode any whitespace if the next line turns | ||||||||||||||
| # out to start with an encoded word. | ||||||||||||||
| lines.append(newline + tstr) | ||||||||||||||
|
|
||||||||||||||
| whitespace_accumulator = [] | ||||||||||||||
| for char in lines[-1]: | ||||||||||||||
| if char not in WSP: | ||||||||||||||
| break | ||||||||||||||
| whitespace_accumulator.append(char) | ||||||||||||||
| leading_whitespace = ''.join(whitespace_accumulator) | ||||||||||||||
| last_word_is_ew = _last_word_is_sill_ew( | ||||||||||||||
| last_word_is_ew, lines[-1] | ||||||||||||||
| ) | ||||||||||||||
| last_ew = None | ||||||||||||||
| continue | ||||||||||||||
| if not hasattr(part, 'encode'): | ||||||||||||||
|
|
@@ -2994,10 +3021,11 @@ def _refold_parse_tree(parse_tree, *, policy): | |||||||||||||
| else: | ||||||||||||||
| # We can't fold it onto the next line either... | ||||||||||||||
| lines[-1] += tstr | ||||||||||||||
| last_word_is_ew = _last_word_is_sill_ew(last_word_is_ew, tstr) | ||||||||||||||
|
|
||||||||||||||
| return policy.linesep.join(lines) + policy.linesep | ||||||||||||||
|
|
||||||||||||||
| def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace): | ||||||||||||||
| def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew): | ||||||||||||||
| """Fold string to_encode into lines as encoded word, combining if allowed. | ||||||||||||||
| Return the new value for last_ew, or None if ew_combine_allowed is False. | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -3012,14 +3040,21 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, | |||||||||||||
| to_encode = str( | ||||||||||||||
| get_unstructured(lines[-1][last_ew:] + to_encode)) | ||||||||||||||
| lines[-1] = lines[-1][:last_ew] | ||||||||||||||
| elif to_encode[0] in WSP: | ||||||||||||||
| elif to_encode[0] in WSP and not last_word_is_ew: | ||||||||||||||
| # We're joining this to non-encoded text, so don't encode | ||||||||||||||
| # the leading blank. | ||||||||||||||
| leading_wsp = to_encode[0] | ||||||||||||||
| to_encode = to_encode[1:] | ||||||||||||||
| if (len(lines[-1]) == maxlen): | ||||||||||||||
| lines.append(_steal_trailing_WSP_if_exists(lines)) | ||||||||||||||
| lines[-1] += leading_wsp | ||||||||||||||
| elif last_word_is_ew: | ||||||||||||||
| # If we are following up an encoded word with another encoded word, | ||||||||||||||
| # any white space between the two will be ignored when decoded. | ||||||||||||||
| # Therefore, we encode all to-be-displayed whitespace in the second | ||||||||||||||
| # encoded word. | ||||||||||||||
| leading_whitespace = _steal_all_trailing_WSP_if_exists(lines) | ||||||||||||||
| to_encode = leading_whitespace + to_encode | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
This avoids that complicated single use function. We should never be producing lines that contain only blanks, they get encoded. You can also move this up to be the first elif and drop the 'and not last_word_is_ew'.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
While testing, I noticed, too, that wrapped spaces are always encoded. TBH, I didn't like that and wanted to come up with a solution that's robust to changing this behavior in the future. I didn't like it because reading the RFC and the rest of this code it sure seems like not encoding words is generally preferable because of its human readability (and to a lesser extent mail clients that don't support encoded words). Further, I thought assuming an invariant that's enforced a few hundred lines away in a different function is a nice recipe for spooky action at a distance. If you anyway think the fix should assume this invariant, I'll change this part. |
||||||||||||||
|
|
||||||||||||||
| trailing_wsp = '' | ||||||||||||||
| if to_encode[-1] in WSP: | ||||||||||||||
|
|
@@ -3040,20 +3075,11 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, | |||||||||||||
|
|
||||||||||||||
| while to_encode: | ||||||||||||||
| remaining_space = maxlen - len(lines[-1]) | ||||||||||||||
| text_space = remaining_space - chrome_len - len(leading_whitespace) | ||||||||||||||
| text_space = remaining_space - chrome_len | ||||||||||||||
| if text_space <= 0: | ||||||||||||||
| lines.append(' ') | ||||||||||||||
| continue | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One of my new tests revealed a pre-existing problem here:
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This suggestion does not seem to fix the test cases |
||||||||||||||
|
|
||||||||||||||
| # If we are at the start of a continuation line, prepend whitespace | ||||||||||||||
| # (we only want to do this when the line starts with an encoded word | ||||||||||||||
| # but if we're folding in this helper function, then we know that we | ||||||||||||||
| # are going to be writing out an encoded word.) | ||||||||||||||
| if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace: | ||||||||||||||
| encoded_word = _ew.encode(leading_whitespace, charset=encode_as) | ||||||||||||||
| lines[-1] += encoded_word | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
|
|
||||||||||||||
| to_encode_word = to_encode[:text_space] | ||||||||||||||
| encoded_word = _ew.encode(to_encode_word, charset=encode_as) | ||||||||||||||
| excess = len(encoded_word) - remaining_space | ||||||||||||||
|
|
@@ -3065,7 +3091,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, | |||||||||||||
| excess = len(encoded_word) - remaining_space | ||||||||||||||
| lines[-1] += encoded_word | ||||||||||||||
| to_encode = to_encode[len(to_encode_word):] | ||||||||||||||
| leading_whitespace = '' | ||||||||||||||
|
|
||||||||||||||
| if to_encode: | ||||||||||||||
| lines.append(' ') | ||||||||||||||
|
|
||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -393,6 +393,26 @@ def test_defaults_handle_spaces_at_start_of_continuation_line(self): | |
| g.flatten(msg) | ||
| self.assertEqual(s.getvalue(), expected) | ||
|
|
||
| # gh-144156: fold between non-encoded and encoded words don't need to encoded | ||
| # the separating space | ||
| def test_defaults_handle_spaces_at_start_of_continuation_line_2(self): | ||
| source = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - " | ||
| "bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines " | ||
| "d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia") | ||
| expected = ( | ||
| b"Subject: " | ||
| b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n" | ||
| b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n" | ||
| b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n" | ||
| b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n" | ||
| ) | ||
| msg = EmailMessage() | ||
| msg['Subject'] = source | ||
| s = io.BytesIO() | ||
| g = BytesGenerator(s) | ||
| g.flatten(msg) | ||
| self.assertEqual(s.getvalue(), expected) | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here is a test harness I used to verify the code: I'm not sure if we should include a variation on this in the PR or not, but you might want to run it a bunch, maybe tweaking the constants, to see if I missed anything. I ran it for a while with the above parameters and saw no failures. Here are two tests that the above produced, that located the pre-existing bug I mentioned above: |
||
| def test_cte_type_7bit_handles_unknown_8bit(self): | ||
| source = ("Subject: Maintenant je vous présente mon " | ||
| "collègue\n\n").encode('utf-8') | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| Fix folding of email headers violating `RFC 2047`_ with two consecutive encoded words without separating linear-white-space. | ||
|
|
||
| Fix the folding of headers by the :mod:`email` library when :rfc:`2047` encoded words are used. Now whitespace is correctly preserved and also correctly added between adjacent encoded words. The latter property was broken by the fix for gh-92081, which mostly fixed previous failures to preserve whitespace. |
Uh oh!
There was an error while loading. Please reload this page.