From 80be530884552f432c91911cbf92e395d75b3fe3 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 04:40:12 +0200 Subject: [PATCH 01/13] do not iterate over all ascii --- Lib/csv.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 0a627ba7a512fa..0f328eef0f35de 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -71,6 +71,7 @@ class excel: QUOTE_STRINGS, QUOTE_NOTNULL from _csv import Dialect as _Dialect +from collections import defaultdict, Counter from io import StringIO __all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", @@ -367,25 +368,30 @@ def _guess_delimiter(self, data, delimiters): data = list(filter(None, data.split('\n'))) - ascii = [chr(c) for c in range(127)] # 7-bit ASCII + ascii = {chr(c) for c in range(127)} # 7-bit ASCII # build frequency tables chunkLength = min(10, len(data)) iteration = 0 - charFrequency = {} + # {char -> {count_per_line -> num_lines_with_that_count}} + charFrequency = defaultdict(Counter) modes = {} delims = {} start, end = 0, chunkLength while start < len(data): iteration += 1 - for line in data[start:end]: - for char in ascii: - metaFrequency = charFrequency.get(char, {}) - # must count even if frequency is 0 - freq = line.count(char) - # value is the mode - metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 - charFrequency[char] = metaFrequency + chunk = data[start:end] + candidate_chars = set("".join(chunk)) + candidate_chars.intersection_update(ascii) + for line in chunk: + for char in candidate_chars: + count = line.count(char) + charFrequency[char][count] += 1 + + missing_chars = ascii.difference(candidate_chars) + chunk_len = len(chunk) + for char in missing_chars: + charFrequency[char][0] += chunk_len for char in charFrequency.keys(): items = list(charFrequency[char].items()) From 2d636cf6b93a6c7308c94ea6b1bb08d093196ce2 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 04:53:15 +0200 Subject: [PATCH 02/13] NEWS entry --- .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst new file mode 100644 index 00000000000000..752f0347f29625 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -0,0 +1 @@ +:meth:`csv.Sniffer._guess_delimiter` 2x faster From 1f0b25ef6a878deca1d7d28704992712fb6c0592 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 04:53:24 +0200 Subject: [PATCH 03/13] bring back the comment --- Lib/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/csv.py b/Lib/csv.py index 0f328eef0f35de..532178bb791165 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -388,6 +388,7 @@ def _guess_delimiter(self, data, delimiters): count = line.count(char) charFrequency[char][count] += 1 + # must count even if frequency is 0 missing_chars = ascii.difference(candidate_chars) chunk_len = len(chunk) for char in missing_chars: From 601b2f13a3cb6da6a66b0b215e91b39476487a41 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 05:03:07 +0200 Subject: [PATCH 04/13] bang --- .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst index 752f0347f29625..ab4612c96c4807 100644 --- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -1 +1 @@ -:meth:`csv.Sniffer._guess_delimiter` 2x faster +:meth:`!csv.Sniffer._guess_delimiter` 2x faster From 2dc0d413a34713d9b6443c4a1a92c1b94fd39da0 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 05:05:55 +0200 Subject: [PATCH 05/13] document the public method --- .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst index ab4612c96c4807..0733c5b91ea7a8 100644 --- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -1 +1 @@ -:meth:`!csv.Sniffer._guess_delimiter` 2x faster +:meth:`csv.Sniffer.sniff` 2x faster From f106da217de4edb713e1002665f526e82be5ca93 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:32:24 +0200 Subject: [PATCH 06/13] import within Sniffer --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 532178bb791165..fe3d97ba0e2e5e 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -71,7 +71,6 @@ class excel: QUOTE_STRINGS, QUOTE_NOTNULL from _csv import Dialect as _Dialect -from collections import defaultdict, Counter from io import StringIO __all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", @@ -365,6 +364,7 @@ def _guess_delimiter(self, data, delimiters): try and evaluate the smallest portion of the data possible, evaluating additional chunks as necessary. """ + from collections import Counter, defaultdict data = list(filter(None, data.split('\n'))) From 2f1ea73be0589e4baa4ca8d219c6963871a24855 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 10:30:09 +0200 Subject: [PATCH 07/13] _ASCII_CHARS, set operators --- Lib/csv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index fe3d97ba0e2e5e..2e2e2fb764ac5b 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -84,6 +84,8 @@ class excel: __version__ = "1.0" +_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII + class Dialect: """Describe a CSV dialect. @@ -368,8 +370,6 @@ def _guess_delimiter(self, data, delimiters): data = list(filter(None, data.split('\n'))) - ascii = {chr(c) for c in range(127)} # 7-bit ASCII - # build frequency tables chunkLength = min(10, len(data)) iteration = 0 @@ -381,15 +381,15 @@ def _guess_delimiter(self, data, delimiters): while start < len(data): iteration += 1 chunk = data[start:end] - candidate_chars = set("".join(chunk)) - candidate_chars.intersection_update(ascii) + candidate_chars = set().union(*chunk) + candidate_chars &= _ASCII_CHARS for line in chunk: for char in candidate_chars: count = line.count(char) charFrequency[char][count] += 1 # must count even if frequency is 0 - missing_chars = ascii.difference(candidate_chars) + missing_chars = _ASCII_CHARS - candidate_chars chunk_len = len(chunk) for char in missing_chars: charFrequency[char][0] += chunk_len From 07a336b3bfda011c53ad9bbabc74c3184988e76d Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:37:02 +0200 Subject: [PATCH 08/13] update docs, no set operations --- Doc/whatsnew/3.15.rst | 7 ++++ Lib/csv.py | 33 +++++++++---------- ...-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 9f01b52f1aff3b..7eaa010c4becf4 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -214,6 +214,13 @@ New modules Improved modules ================ +csv +--- + +* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized, + and is now up to 1.5x faster. + (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.) + dbm --- diff --git a/Lib/csv.py b/Lib/csv.py index 2e2e2fb764ac5b..cc7644d694b43d 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -84,8 +84,6 @@ class excel: __version__ = "1.0" -_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII - class Dialect: """Describe a CSV dialect. @@ -373,6 +371,7 @@ def _guess_delimiter(self, data, delimiters): # build frequency tables chunkLength = min(10, len(data)) iteration = 0 + seen = 0 # {char -> {count_per_line -> num_lines_with_that_count}} charFrequency = defaultdict(Counter) modes = {} @@ -380,22 +379,20 @@ def _guess_delimiter(self, data, delimiters): start, end = 0, chunkLength while start < len(data): iteration += 1 - chunk = data[start:end] - candidate_chars = set().union(*chunk) - candidate_chars &= _ASCII_CHARS - for line in chunk: - for char in candidate_chars: - count = line.count(char) - charFrequency[char][count] += 1 - - # must count even if frequency is 0 - missing_chars = _ASCII_CHARS - candidate_chars - chunk_len = len(chunk) - for char in missing_chars: - charFrequency[char][0] += chunk_len - - for char in charFrequency.keys(): - items = list(charFrequency[char].items()) + for line in data[start:end]: + seen += 1 + charCounts = Counter(line) + for char, count in charCounts.items(): + if ord(char) < 127: + charFrequency[char][count] += 1 + + for char, counts in charFrequency.items(): + presentCount = sum(counts.values()) + zeroCount = seen - presentCount + if zeroCount > 0: + items = list(counts.items()) + [(0, zeroCount)] + else: + items = list(counts.items()) if len(items) == 1 and items[0][0] == 0: continue # get the mode of the frequencies diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst index 0733c5b91ea7a8..82111c1c2f2b28 100644 --- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -1 +1 @@ -:meth:`csv.Sniffer.sniff` 2x faster +Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.5x. From 2ccaac0ac434bc7b1fad949a41accc524a0a6513 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 18 Aug 2025 20:38:53 +0200 Subject: [PATCH 09/13] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Lib/csv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index cc7644d694b43d..244271c8a1fdf8 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -381,8 +381,7 @@ def _guess_delimiter(self, data, delimiters): iteration += 1 for line in data[start:end]: seen += 1 - charCounts = Counter(line) - for char, count in charCounts.items(): + for char, count in Counter(line).items(): if ord(char) < 127: charFrequency[char][count] += 1 From 36fc9d9bfa221d3d2cdb31726ca46ce8bcb757cc Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 18 Aug 2025 20:41:15 +0200 Subject: [PATCH 10/13] move whatsnew to Optimizations --- Doc/whatsnew/3.15.rst | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index e9d379d1794825..824f58086a4fff 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -218,13 +218,6 @@ New modules Improved modules ================ -csv ---- - -* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized, - and is now up to 1.5x faster. - (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.) - dbm --- @@ -405,11 +398,12 @@ zlib Optimizations ============= -module_name ------------ - -* TODO +csv +--- +* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized, + and is now up to 1.5x faster. + (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.) Deprecated From 7189b5112316c5f18b259c8d8aec320a1fcb5ad4 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 18 Aug 2025 20:46:30 +0200 Subject: [PATCH 11/13] s/seen/num_lines/ --- Lib/csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 244271c8a1fdf8..a09211edc9654a 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -371,7 +371,7 @@ def _guess_delimiter(self, data, delimiters): # build frequency tables chunkLength = min(10, len(data)) iteration = 0 - seen = 0 + num_lines = 0 # {char -> {count_per_line -> num_lines_with_that_count}} charFrequency = defaultdict(Counter) modes = {} @@ -380,14 +380,14 @@ def _guess_delimiter(self, data, delimiters): while start < len(data): iteration += 1 for line in data[start:end]: - seen += 1 + num_lines += 1 for char, count in Counter(line).items(): if ord(char) < 127: charFrequency[char][count] += 1 for char, counts in charFrequency.items(): presentCount = sum(counts.values()) - zeroCount = seen - presentCount + zeroCount = num_lines - presentCount if zeroCount > 0: items = list(counts.items()) + [(0, zeroCount)] else: From 6b64ba40c1867b81fa9d5ace8cb9c8e13dc00302 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 18 Aug 2025 21:15:54 +0200 Subject: [PATCH 12/13] picnixz suggestion --- Lib/csv.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index a09211edc9654a..e4426b35d71826 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -386,12 +386,14 @@ def _guess_delimiter(self, data, delimiters): charFrequency[char][count] += 1 for char, counts in charFrequency.items(): - presentCount = sum(counts.values()) - zeroCount = num_lines - presentCount - if zeroCount > 0: - items = list(counts.items()) + [(0, zeroCount)] - else: - items = list(counts.items()) + items = list(counts.items()) + missed_lines = num_lines - sum(counts.values()) + if missed_lines: + # charFrequency[char][0] can only be deduced now + # as it cannot be obtained when parsing the lines. + assert 0 not in counts.keys() + # Store the number of lines 'char' was missing from. + items.append((0, missed_lines)) if len(items) == 1 and items[0][0] == 0: continue # get the mode of the frequencies From 4b62c848bae2e2a28e8d70a8aa00ea35b5f5b0ca Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 18 Aug 2025 21:17:06 +0200 Subject: [PATCH 13/13] use isascii --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index e4426b35d71826..32ac6131d7dd0d 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -382,7 +382,7 @@ def _guess_delimiter(self, data, delimiters): for line in data[start:end]: num_lines += 1 for char, count in Counter(line).items(): - if ord(char) < 127: + if char.isascii(): charFrequency[char][count] += 1 for char, counts in charFrequency.items():