From ef9d22f49f7bc371d2ec6f22c07fbeb1a816a431 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 6 Jun 2025 02:17:57 +0100 Subject: [PATCH 1/3] gh-135148: Correctly handle f/t strings with comments and debug expressions --- Lib/test/test_fstring.py | 6 ++ ...-06-06-02-24-42.gh-issue-135148.r-t2sC.rst | 3 + Parser/lexer/lexer.c | 85 ++++++++++++++++--- 3 files changed, 81 insertions(+), 13 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index dd58e032a8befe..fcda09fc58d2c7 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1651,6 +1651,12 @@ def __repr__(self): self.assertEqual(f"{1+2 = # my comment }", '1+2 = \n 3') + self.assertEqual(f'{""" # booo + """=}', '""" # booo\n """=\' # booo\\n \'') + + self.assertEqual(f'{" # nooo "=}', '" # nooo "=\' # nooo \'') + self.assertEqual(f'{" \" # nooo \" "=}', '" \\" # nooo \\" "=\' " # nooo " \'') + # These next lines contains tabs. Backslash escapes don't # work in f-strings. # patchcheck doesn't like these tabs. So the only way to test diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst new file mode 100644 index 00000000000000..9b1f62433b45ed --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-06-06-02-24-42.gh-issue-135148.r-t2sC.rst @@ -0,0 +1,3 @@ +Fixed a bug where f-string debug expressions (using =) would incorrectly +strip out parts of strings containing escaped quotes and # characters. Patch +by Pablo Galindo. diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 4d10bccf0a53f2..bfa98b2fe70d43 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -123,35 +123,96 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { // Check if there is a # character in the expression int hash_detected = 0; + int in_string = 0; + char string_quote = 0; for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { - if (tok_mode->last_expr_buffer[i] == '#') { + char ch = tok_mode->last_expr_buffer[i]; + if (ch == '\\' && i + 1 < tok_mode->last_expr_size - tok_mode->last_expr_end) { + // Skip the next character if it's an escape sequence + i++; + continue; + } + if (ch == '"' || ch == '\'') { + if (!in_string) { + in_string = 1; + string_quote = ch; + } else if (ch == string_quote) { + // Check for triple quotes + if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && + i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { + // Skip the rest of the triple quote + i += 2; + } + in_string = 0; + } + } else if (ch == '#' && !in_string) { hash_detected = 1; break; } } - + // If we found a # character in the expression, we need to handle comments if (hash_detected) { + // Calculate length of input we need to process Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end; + + // Allocate buffer for processed result, with room for null terminator char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char)); if (!result) { return -1; } - Py_ssize_t i = 0; - Py_ssize_t j = 0; + // Initialize counters and state + Py_ssize_t i = 0; // Input position + Py_ssize_t j = 0; // Output position + in_string = 0; // Whether we're currently inside a string + string_quote = 0; // The quote character for current string (' or ") + // Process each character of input for (i = 0, j = 0; i < input_length; i++) { - if (tok_mode->last_expr_buffer[i] == '#') { - // Skip characters until newline or end of string + char ch = tok_mode->last_expr_buffer[i]; + + // Handle escape sequences - copy both backslash and next char + if (ch == '\\' && i + 1 < input_length) { + result[j++] = ch; // Copy backslash + result[j++] = tok_mode->last_expr_buffer[++i]; // Copy escaped char + continue; + } + + // Handle string quotes + if (ch == '"' || ch == '\'') { + if (!in_string) { + // Start of new string + in_string = 1; + string_quote = ch; + } else if (ch == string_quote) { + // Potential end of string - check for triple quotes + if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && + i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { + // Found triple quote - copy all three quotes + result[j++] = ch; + result[j++] = ch; + result[j++] = ch; + i += 2; // Skip the other two quotes + continue; + } + // End of regular string + in_string = 0; + } + result[j++] = ch; // Copy the quote character + } + // Handle comments - skip everything until newline + else if (ch == '#' && !in_string) { while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') { if (tok_mode->last_expr_buffer[i] == '\n') { - result[j++] = tok_mode->last_expr_buffer[i]; + result[j++] = tok_mode->last_expr_buffer[i]; // Keep newline break; } - i++; + i++; // Skip comment character } - } else { - result[j++] = tok_mode->last_expr_buffer[i]; + } + // Copy any other character unchanged + else { + result[j++] = ch; } } @@ -164,11 +225,9 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { tok_mode->last_expr_size - tok_mode->last_expr_end, NULL ); - } - - if (!res) { + if (!res) { return -1; } token->metadata = res; From fdc81d612ea240e53df787e703097346b6d533fa Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sat, 7 Jun 2025 02:06:46 +0100 Subject: [PATCH 2/3] Address review --- Lib/test/test_fstring.py | 6 +++ Parser/lexer/lexer.c | 82 ++++++++++++++++------------------------ 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index fcda09fc58d2c7..89d425d6e27aa7 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1657,6 +1657,12 @@ def __repr__(self): self.assertEqual(f'{" # nooo "=}', '" # nooo "=\' # nooo \'') self.assertEqual(f'{" \" # nooo \" "=}', '" \\" # nooo \\" "=\' " # nooo " \'') + self.assertEqual(f'{ # some comment goes here + """hello"""=}', ' \n """hello"""=\'hello\'') + self.assertEqual(f'{"""# this is not a comment + a""" # this is a comment + }', '# this is not a comment\n a') + # These next lines contains tabs. Backslash escapes don't # work in f-strings. # patchcheck doesn't like these tabs. So the only way to test diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index bfa98b2fe70d43..2a461ac49e8cbe 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -121,99 +121,81 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { } PyObject *res = NULL; - // Check if there is a # character in the expression + // Look for a # character outside of string literals int hash_detected = 0; int in_string = 0; + char quote_char = 0; char string_quote = 0; + for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { char ch = tok_mode->last_expr_buffer[i]; - if (ch == '\\' && i + 1 < tok_mode->last_expr_size - tok_mode->last_expr_end) { - // Skip the next character if it's an escape sequence + + // Skip escaped characters + if (ch == '\\') { i++; continue; } + + // Handle quotes if (ch == '"' || ch == '\'') { if (!in_string) { in_string = 1; - string_quote = ch; - } else if (ch == string_quote) { - // Check for triple quotes - if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && - i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { - // Skip the rest of the triple quote - i += 2; - } + quote_char = ch; + } + else if (ch == quote_char) { in_string = 0; } - } else if (ch == '#' && !in_string) { + continue; + } + + // Check for # outside strings + if (ch == '#' && !in_string) { hash_detected = 1; break; } } // If we found a # character in the expression, we need to handle comments if (hash_detected) { - // Calculate length of input we need to process - Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end; - - // Allocate buffer for processed result, with room for null terminator - char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char)); + // Allocate buffer for processed result + char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char)); if (!result) { return -1; } - // Initialize counters and state Py_ssize_t i = 0; // Input position Py_ssize_t j = 0; // Output position - in_string = 0; // Whether we're currently inside a string - string_quote = 0; // The quote character for current string (' or ") + in_string = 0; // Whether we're in a string + string_quote = 0; // Current string quote char - // Process each character of input - for (i = 0, j = 0; i < input_length; i++) { + // Process each character + while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { char ch = tok_mode->last_expr_buffer[i]; - // Handle escape sequences - copy both backslash and next char - if (ch == '\\' && i + 1 < input_length) { - result[j++] = ch; // Copy backslash - result[j++] = tok_mode->last_expr_buffer[++i]; // Copy escaped char - continue; - } - // Handle string quotes if (ch == '"' || ch == '\'') { if (!in_string) { - // Start of new string in_string = 1; string_quote = ch; } else if (ch == string_quote) { - // Potential end of string - check for triple quotes - if (i > 0 && tok_mode->last_expr_buffer[i-1] == ch && - i > 1 && tok_mode->last_expr_buffer[i-2] == ch) { - // Found triple quote - copy all three quotes - result[j++] = ch; - result[j++] = ch; - result[j++] = ch; - i += 2; // Skip the other two quotes - continue; - } - // End of regular string in_string = 0; } - result[j++] = ch; // Copy the quote character + result[j++] = ch; } - // Handle comments - skip everything until newline + // Skip comments else if (ch == '#' && !in_string) { - while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') { - if (tok_mode->last_expr_buffer[i] == '\n') { - result[j++] = tok_mode->last_expr_buffer[i]; // Keep newline - break; - } - i++; // Skip comment character + while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && + tok_mode->last_expr_buffer[i] != '\n') { + i++; + } + if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { + result[j++] = '\n'; } } - // Copy any other character unchanged + // Copy other chars else { result[j++] = ch; } + i++; } result[j] = '\0'; // Null-terminate the result string From 3c86cce55143ed98db6c8a51a431a363eecbd4aa Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sat, 7 Jun 2025 14:33:49 +0100 Subject: [PATCH 3/3] Fix linting --- Parser/lexer/lexer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 2a461ac49e8cbe..04c9777cd616ae 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -126,16 +126,16 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { int in_string = 0; char quote_char = 0; char string_quote = 0; - + for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { char ch = tok_mode->last_expr_buffer[i]; - + // Skip escaped characters if (ch == '\\') { i++; continue; } - + // Handle quotes if (ch == '"' || ch == '\'') { if (!in_string) { @@ -147,7 +147,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { } continue; } - + // Check for # outside strings if (ch == '#' && !in_string) { hash_detected = 1; @@ -183,7 +183,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { } // Skip comments else if (ch == '#' && !in_string) { - while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && + while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && tok_mode->last_expr_buffer[i] != '\n') { i++; }