Update docs; replace set operations with per-line Counter in delimiter detection

maurycy · maurycy · commit 07a336b3bfda · 2025-08-13T12:37:02.000+02:00
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
@@ -214,6 +214,13 @@ New modules
 Improved modules
 ================
 
+csv
+---
+
+* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized
+  and is now up to 1.5x faster.
+  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
+
 dbm
 ---
 
diff --git a/Lib/csv.py b/Lib/csv.py
@@ -84,8 +84,6 @@ class excel:
 __version__ = "1.0"
 
 
-_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII
-
 class Dialect:
     """Describe a CSV dialect.
 
@@ -373,29 +371,28 @@ def _guess_delimiter(self, data, delimiters):
         # build frequency tables
         chunkLength = min(10, len(data))
         iteration = 0
+        seen = 0
         # {char -> {count_per_line -> num_lines_with_that_count}}
         charFrequency = defaultdict(Counter)
         modes = {}
         delims = {}
         start, end = 0, chunkLength
         while start < len(data):
             iteration += 1
-            chunk = data[start:end]
-            candidate_chars = set().union(*chunk)
-            candidate_chars &= _ASCII_CHARS
-            for line in chunk:
-                for char in candidate_chars:
-                    count = line.count(char)
-                    charFrequency[char][count] += 1
-
-            # must count even if frequency is 0
-            missing_chars = _ASCII_CHARS - candidate_chars
-            chunk_len = len(chunk)
-            for char in missing_chars:
-                charFrequency[char][0] += chunk_len
-
-            for char in charFrequency.keys():
-                items = list(charFrequency[char].items())
+            for line in data[start:end]:
+                seen += 1
+                charCounts = Counter(line)
+                for char, count in charCounts.items():
+                    if ord(char) < 127:
+                        charFrequency[char][count] += 1
+
+            for char, counts in charFrequency.items():
+                presentCount = sum(counts.values())
+                zeroCount = seen - presentCount
+                if zeroCount > 0:
+                    items = list(counts.items()) + [(0, zeroCount)]
+                else:
+                    items = list(counts.items())
                 if len(items) == 1 and items[0][0] == 0:
                     continue
                 # get the mode of the frequencies
diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
@@ -1 +1 @@
-:meth:`csv.Sniffer.sniff` 2x faster
+Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.5x.

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | -:meth:`csv.Sniffer.sniff` 2x faster |
| | `1` | +Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.5x. |