From 80be530884552f432c91911cbf92e395d75b3fe3 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 04:40:12 +0200 Subject: [PATCH 1/8] do not iterate over all ascii --- Lib/csv.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 0a627ba7a512fa..0f328eef0f35de 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -71,6 +71,7 @@ class excel: QUOTE_STRINGS, QUOTE_NOTNULL from _csv import Dialect as _Dialect +from collections import defaultdict, Counter from io import StringIO __all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", @@ -367,25 +368,30 @@ def _guess_delimiter(self, data, delimiters): data = list(filter(None, data.split('\n'))) - ascii = [chr(c) for c in range(127)] # 7-bit ASCII + ascii = {chr(c) for c in range(127)} # 7-bit ASCII # build frequency tables chunkLength = min(10, len(data)) iteration = 0 - charFrequency = {} + # {char -> {count_per_line -> num_lines_with_that_count}} + charFrequency = defaultdict(Counter) modes = {} delims = {} start, end = 0, chunkLength while start < len(data): iteration += 1 - for line in data[start:end]: - for char in ascii: - metaFrequency = charFrequency.get(char, {}) - # must count even if frequency is 0 - freq = line.count(char) - # value is the mode - metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 - charFrequency[char] = metaFrequency + chunk = data[start:end] + candidate_chars = set("".join(chunk)) + candidate_chars.intersection_update(ascii) + for line in chunk: + for char in candidate_chars: + count = line.count(char) + charFrequency[char][count] += 1 + + missing_chars = ascii.difference(candidate_chars) + chunk_len = len(chunk) + for char in missing_chars: + charFrequency[char][0] += chunk_len for char in charFrequency.keys(): items = list(charFrequency[char].items()) From 2d636cf6b93a6c7308c94ea6b1bb08d093196ce2 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 04:53:15 +0200 Subject: [PATCH 2/8] NEWS entry --- .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst new file mode 100644 index 00000000000000..752f0347f29625 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -0,0 +1 @@ +:meth:`csv.Sniffer._guess_delimiter` 2x faster From 1f0b25ef6a878deca1d7d28704992712fb6c0592 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 04:53:24 +0200 Subject: [PATCH 3/8] bring back the comment --- Lib/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/csv.py b/Lib/csv.py index 0f328eef0f35de..532178bb791165 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -388,6 +388,7 @@ def _guess_delimiter(self, data, delimiters): count = line.count(char) charFrequency[char][count] += 1 + # must count even if frequency is 0 missing_chars = ascii.difference(candidate_chars) chunk_len = len(chunk) for char in missing_chars: From 601b2f13a3cb6da6a66b0b215e91b39476487a41 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 05:03:07 +0200 Subject: [PATCH 4/8] bang --- .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst index 752f0347f29625..ab4612c96c4807 100644 --- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -1 +1 @@ -:meth:`csv.Sniffer._guess_delimiter` 2x faster +:meth:`!csv.Sniffer._guess_delimiter` 2x faster From 2dc0d413a34713d9b6443c4a1a92c1b94fd39da0 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 05:05:55 +0200 Subject: [PATCH 5/8] document the public method --- .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst index ab4612c96c4807..0733c5b91ea7a8 100644 --- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -1 +1 @@ -:meth:`!csv.Sniffer._guess_delimiter` 2x faster +:meth:`csv.Sniffer.sniff` 2x faster From f106da217de4edb713e1002665f526e82be5ca93 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 07:32:24 +0200 Subject: [PATCH 6/8] import within Sniffer --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 532178bb791165..fe3d97ba0e2e5e 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -71,7 +71,6 @@ class excel: QUOTE_STRINGS, QUOTE_NOTNULL from _csv import Dialect as _Dialect -from collections import defaultdict, Counter from io import StringIO __all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", @@ -365,6 +364,7 @@ def _guess_delimiter(self, data, delimiters): try and evaluate the smallest portion of the data possible, evaluating additional chunks as necessary. """ + from collections import Counter, defaultdict data = list(filter(None, data.split('\n'))) From 2f1ea73be0589e4baa4ca8d219c6963871a24855 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Mon, 11 Aug 2025 10:30:09 +0200 Subject: [PATCH 7/8] _ASCII_CHARS, set operators --- Lib/csv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index fe3d97ba0e2e5e..2e2e2fb764ac5b 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -84,6 +84,8 @@ class excel: __version__ = "1.0" +_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII + class Dialect: """Describe a CSV dialect. @@ -368,8 +370,6 @@ def _guess_delimiter(self, data, delimiters): data = list(filter(None, data.split('\n'))) - ascii = {chr(c) for c in range(127)} # 7-bit ASCII - # build frequency tables chunkLength = min(10, len(data)) iteration = 0 @@ -381,15 +381,15 @@ def _guess_delimiter(self, data, delimiters): while start < len(data): iteration += 1 chunk = data[start:end] - candidate_chars = set("".join(chunk)) - candidate_chars.intersection_update(ascii) + candidate_chars = set().union(*chunk) + candidate_chars &= _ASCII_CHARS for line in chunk: for char in candidate_chars: count = line.count(char) charFrequency[char][count] += 1 # must count even if frequency is 0 - missing_chars = ascii.difference(candidate_chars) + missing_chars = _ASCII_CHARS - candidate_chars chunk_len = len(chunk) for char in missing_chars: charFrequency[char][0] += chunk_len From 07a336b3bfda011c53ad9bbabc74c3184988e76d Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:37:02 +0200 Subject: [PATCH 8/8] update docs, no set operations --- Doc/whatsnew/3.15.rst | 7 ++++ Lib/csv.py | 33 +++++++++---------- ...-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 9f01b52f1aff3b..7eaa010c4becf4 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -214,6 +214,13 @@ New modules Improved modules ================ +csv +--- + +* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized, + and is now up to 1.5x faster. + (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.) + dbm --- diff --git a/Lib/csv.py b/Lib/csv.py index 2e2e2fb764ac5b..cc7644d694b43d 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -84,8 +84,6 @@ class excel: __version__ = "1.0" -_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII - class Dialect: """Describe a CSV dialect. @@ -373,6 +371,7 @@ def _guess_delimiter(self, data, delimiters): # build frequency tables chunkLength = min(10, len(data)) iteration = 0 + seen = 0 # {char -> {count_per_line -> num_lines_with_that_count}} charFrequency = defaultdict(Counter) modes = {} @@ -380,22 +379,20 @@ def _guess_delimiter(self, data, delimiters): start, end = 0, chunkLength while start < len(data): iteration += 1 - chunk = data[start:end] - candidate_chars = set().union(*chunk) - candidate_chars &= _ASCII_CHARS - for line in chunk: - for char in candidate_chars: - count = line.count(char) - charFrequency[char][count] += 1 - - # must count even if frequency is 0 - missing_chars = _ASCII_CHARS - candidate_chars - chunk_len = len(chunk) - for char in missing_chars: - charFrequency[char][0] += chunk_len - - for char in charFrequency.keys(): - items = list(charFrequency[char].items()) + for line in data[start:end]: + seen += 1 + charCounts = Counter(line) + for char, count in charCounts.items(): + if ord(char) < 127: + charFrequency[char][count] += 1 + + for char, counts in charFrequency.items(): + presentCount = sum(counts.values()) + zeroCount = seen - presentCount + if zeroCount > 0: + items = list(counts.items()) + [(0, zeroCount)] + else: + items = list(counts.items()) if len(items) == 1 and items[0][0] == 0: continue # get the mode of the frequencies diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst index 0733c5b91ea7a8..82111c1c2f2b28 100644 --- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -1 +1 @@ -:meth:`csv.Sniffer.sniff` 2x faster +Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.5x.