diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 9f01b52f1aff3b..7eaa010c4becf4 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -214,6 +214,13 @@ New modules Improved modules ================ +csv +--- + +* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized, + and is now up to 1.5x faster. + (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.) + dbm --- diff --git a/Lib/csv.py b/Lib/csv.py index 0a627ba7a512fa..cc7644d694b43d 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -364,31 +364,35 @@ def _guess_delimiter(self, data, delimiters): try and evaluate the smallest portion of the data possible, evaluating additional chunks as necessary. """ + from collections import Counter, defaultdict data = list(filter(None, data.split('\n'))) - ascii = [chr(c) for c in range(127)] # 7-bit ASCII - # build frequency tables chunkLength = min(10, len(data)) iteration = 0 - charFrequency = {} + seen = 0 + # {char -> {count_per_line -> num_lines_with_that_count}} + charFrequency = defaultdict(Counter) modes = {} delims = {} start, end = 0, chunkLength while start < len(data): iteration += 1 for line in data[start:end]: - for char in ascii: - metaFrequency = charFrequency.get(char, {}) - # must count even if frequency is 0 - freq = line.count(char) - # value is the mode - metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 - charFrequency[char] = metaFrequency - - for char in charFrequency.keys(): - items = list(charFrequency[char].items()) + seen += 1 + charCounts = Counter(line) + for char, count in charCounts.items(): + if ord(char) < 127: + charFrequency[char][count] += 1 + + for char, counts in charFrequency.items(): + presentCount = sum(counts.values()) + zeroCount = seen - presentCount + if zeroCount > 0: + items = list(counts.items()) + [(0, zeroCount)] + else: + items = list(counts.items()) if len(items) == 1 and items[0][0] == 0: continue # get the mode of the frequencies diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst new file mode 100644 index 00000000000000..82111c1c2f2b28 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst @@ -0,0 +1 @@ +Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.5x.