From 80be530884552f432c91911cbf92e395d75b3fe3 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 11 Aug 2025 04:40:12 +0200
Subject: [PATCH 01/13] do not iterate over all ascii

---
 Lib/csv.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index 0a627ba7a512fa..0f328eef0f35de 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -71,6 +71,7 @@ class excel:
                  QUOTE_STRINGS, QUOTE_NOTNULL
 from _csv import Dialect as _Dialect
 
+from collections import defaultdict, Counter
 from io import StringIO
 
 __all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
@@ -367,25 +368,30 @@ def _guess_delimiter(self, data, delimiters):
 
         data = list(filter(None, data.split('\n')))
 
-        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
+        ascii = {chr(c) for c in range(127)} # 7-bit ASCII
 
         # build frequency tables
         chunkLength = min(10, len(data))
         iteration = 0
-        charFrequency = {}
+        # {char -> {count_per_line -> num_lines_with_that_count}}
+        charFrequency = defaultdict(Counter)
         modes = {}
         delims = {}
         start, end = 0, chunkLength
         while start < len(data):
             iteration += 1
-            for line in data[start:end]:
-                for char in ascii:
-                    metaFrequency = charFrequency.get(char, {})
-                    # must count even if frequency is 0
-                    freq = line.count(char)
-                    # value is the mode
-                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
-                    charFrequency[char] = metaFrequency
+            chunk = data[start:end]
+            candidate_chars = set("".join(chunk))
+            candidate_chars.intersection_update(ascii)
+            for line in chunk:
+                for char in candidate_chars:
+                    count = line.count(char)
+                    charFrequency[char][count] += 1
+
+            missing_chars = ascii.difference(candidate_chars)
+            chunk_len = len(chunk)
+            for char in missing_chars:
+                charFrequency[char][0] += chunk_len
 
             for char in charFrequency.keys():
                 items = list(charFrequency[char].items())

From 2d636cf6b93a6c7308c94ea6b1bb08d093196ce2 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 11 Aug 2025 04:53:15 +0200
Subject: [PATCH 02/13] NEWS entry

---
 .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst  | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst

diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
new file mode 100644
index 00000000000000..752f0347f29625
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
@@ -0,0 +1 @@
+:meth:`csv.Sniffer._guess_delimiter` 2x faster

From 1f0b25ef6a878deca1d7d28704992712fb6c0592 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 11 Aug 2025 04:53:24 +0200
Subject: [PATCH 03/13] bring back the comment

---
 Lib/csv.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Lib/csv.py b/Lib/csv.py
index 0f328eef0f35de..532178bb791165 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -388,6 +388,7 @@ def _guess_delimiter(self, data, delimiters):
                     count = line.count(char)
                     charFrequency[char][count] += 1
 
+            # must count even if frequency is 0
             missing_chars = ascii.difference(candidate_chars)
             chunk_len = len(chunk)
             for char in missing_chars:

From 601b2f13a3cb6da6a66b0b215e91b39476487a41 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 11 Aug 2025 05:03:07 +0200
Subject: [PATCH 04/13] NEWS: prefix the private method reference with '!'

---
 .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
index 752f0347f29625..ab4612c96c4807 100644
--- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
+++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
@@ -1 +1 @@
-:meth:`csv.Sniffer._guess_delimiter` 2x faster
+:meth:`!csv.Sniffer._guess_delimiter` 2x faster

From 2dc0d413a34713d9b6443c4a1a92c1b94fd39da0 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 11 Aug 2025 05:05:55 +0200
Subject: [PATCH 05/13] document the public method

---
 .../next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
index ab4612c96c4807..0733c5b91ea7a8 100644
--- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
+++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
@@ -1 +1 @@
-:meth:`!csv.Sniffer._guess_delimiter` 2x faster
+:meth:`csv.Sniffer.sniff` 2x faster

From f106da217de4edb713e1002665f526e82be5ca93 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 11 Aug 2025 07:32:24 +0200
Subject: [PATCH 06/13] import within Sniffer

---
 Lib/csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index 532178bb791165..fe3d97ba0e2e5e 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -71,7 +71,6 @@ class excel:
                  QUOTE_STRINGS, QUOTE_NOTNULL
 from _csv import Dialect as _Dialect
 
-from collections import defaultdict, Counter
 from io import StringIO
 
 __all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
@@ -365,6 +364,7 @@ def _guess_delimiter(self, data, delimiters):
         try and evaluate the smallest portion of the data possible, evaluating
         additional chunks as necessary.
         """
+        from collections import Counter, defaultdict
 
         data = list(filter(None, data.split('\n')))
 

From 2f1ea73be0589e4baa4ca8d219c6963871a24855 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 11 Aug 2025 10:30:09 +0200
Subject: [PATCH 07/13] _ASCII_CHARS, set operators

---
 Lib/csv.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index fe3d97ba0e2e5e..2e2e2fb764ac5b 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -84,6 +84,8 @@ class excel:
 __version__ = "1.0"
 
 
+_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII
+
 class Dialect:
     """Describe a CSV dialect.
 
@@ -368,8 +370,6 @@ def _guess_delimiter(self, data, delimiters):
 
         data = list(filter(None, data.split('\n')))
 
-        ascii = {chr(c) for c in range(127)} # 7-bit ASCII
-
         # build frequency tables
         chunkLength = min(10, len(data))
         iteration = 0
@@ -381,15 +381,15 @@ def _guess_delimiter(self, data, delimiters):
         while start < len(data):
             iteration += 1
             chunk = data[start:end]
-            candidate_chars = set("".join(chunk))
-            candidate_chars.intersection_update(ascii)
+            candidate_chars = set().union(*chunk)
+            candidate_chars &= _ASCII_CHARS
             for line in chunk:
                 for char in candidate_chars:
                     count = line.count(char)
                     charFrequency[char][count] += 1
 
             # must count even if frequency is 0
-            missing_chars = ascii.difference(candidate_chars)
+            missing_chars = _ASCII_CHARS - candidate_chars
             chunk_len = len(chunk)
             for char in missing_chars:
                 charFrequency[char][0] += chunk_len

From 07a336b3bfda011c53ad9bbabc74c3184988e76d Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Wed, 13 Aug 2025 12:37:02 +0200
Subject: [PATCH 08/13] update docs, no set operations

---
 Doc/whatsnew/3.15.rst                         |  7 ++++
 Lib/csv.py                                    | 33 +++++++++----------
 ...-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst |  2 +-
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index 9f01b52f1aff3b..7eaa010c4becf4 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -214,6 +214,13 @@ New modules
 Improved modules
 ================
 
+csv
+---
+
+* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized,
+  and is now up to 1.5x faster.
+  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
+
 dbm
 ---
 
diff --git a/Lib/csv.py b/Lib/csv.py
index 2e2e2fb764ac5b..cc7644d694b43d 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -84,8 +84,6 @@ class excel:
 __version__ = "1.0"
 
 
-_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII
-
 class Dialect:
     """Describe a CSV dialect.
 
@@ -373,6 +371,7 @@ def _guess_delimiter(self, data, delimiters):
         # build frequency tables
         chunkLength = min(10, len(data))
         iteration = 0
+        seen = 0
         # {char -> {count_per_line -> num_lines_with_that_count}}
         charFrequency = defaultdict(Counter)
         modes = {}
@@ -380,22 +379,20 @@ def _guess_delimiter(self, data, delimiters):
         start, end = 0, chunkLength
         while start < len(data):
             iteration += 1
-            chunk = data[start:end]
-            candidate_chars = set().union(*chunk)
-            candidate_chars &= _ASCII_CHARS
-            for line in chunk:
-                for char in candidate_chars:
-                    count = line.count(char)
-                    charFrequency[char][count] += 1
-
-            # must count even if frequency is 0
-            missing_chars = _ASCII_CHARS - candidate_chars
-            chunk_len = len(chunk)
-            for char in missing_chars:
-                charFrequency[char][0] += chunk_len
-
-            for char in charFrequency.keys():
-                items = list(charFrequency[char].items())
+            for line in data[start:end]:
+                seen += 1
+                charCounts = Counter(line)
+                for char, count in charCounts.items():
+                    if ord(char) < 127:
+                        charFrequency[char][count] += 1
+
+            for char, counts in charFrequency.items():
+                presentCount = sum(counts.values())
+                zeroCount = seen - presentCount
+                if zeroCount > 0:
+                    items = list(counts.items()) + [(0, zeroCount)]
+                else:
+                    items = list(counts.items())
                 if len(items) == 1 and items[0][0] == 0:
                     continue
                 # get the mode of the frequencies
diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
index 0733c5b91ea7a8..82111c1c2f2b28 100644
--- a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
+++ b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
@@ -1 +1 @@
-:meth:`csv.Sniffer.sniff` 2x faster
+Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.5x.

From 2ccaac0ac434bc7b1fad949a41accc524a0a6513 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 18 Aug 2025 20:38:53 +0200
Subject: [PATCH 09/13] Lib/csv.py: iterate over Counter(line).items() directly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
---
 Lib/csv.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index cc7644d694b43d..244271c8a1fdf8 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -381,8 +381,7 @@ def _guess_delimiter(self, data, delimiters):
             iteration += 1
             for line in data[start:end]:
                 seen += 1
-                charCounts = Counter(line)
-                for char, count in charCounts.items():
+                for char, count in Counter(line).items():
                     if ord(char) < 127:
                         charFrequency[char][count] += 1
 

From 36fc9d9bfa221d3d2cdb31726ca46ce8bcb757cc Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 18 Aug 2025 20:41:15 +0200
Subject: [PATCH 10/13] move whatsnew to Optimizations

---
 Doc/whatsnew/3.15.rst | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index e9d379d1794825..824f58086a4fff 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -218,13 +218,6 @@ New modules
 Improved modules
 ================
 
-csv
----
-
-* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized,
-  and is now up to 1.5x faster.
-  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
-
 dbm
 ---
 
@@ -405,11 +398,12 @@ zlib
 Optimizations
 =============
 
-module_name
------------
-
-* TODO
+csv
+---
 
+* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized,
+  and is now up to 1.5x faster.
+  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
 
 
 Deprecated

From 7189b5112316c5f18b259c8d8aec320a1fcb5ad4 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 18 Aug 2025 20:46:30 +0200
Subject: [PATCH 11/13] s/seen/num_lines/

---
 Lib/csv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index 244271c8a1fdf8..a09211edc9654a 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -371,7 +371,7 @@ def _guess_delimiter(self, data, delimiters):
         # build frequency tables
         chunkLength = min(10, len(data))
         iteration = 0
-        seen = 0
+        num_lines = 0
         # {char -> {count_per_line -> num_lines_with_that_count}}
         charFrequency = defaultdict(Counter)
         modes = {}
@@ -380,14 +380,14 @@ def _guess_delimiter(self, data, delimiters):
         while start < len(data):
             iteration += 1
             for line in data[start:end]:
-                seen += 1
+                num_lines += 1
                 for char, count in Counter(line).items():
                     if ord(char) < 127:
                         charFrequency[char][count] += 1
 
             for char, counts in charFrequency.items():
                 presentCount = sum(counts.values())
-                zeroCount = seen - presentCount
+                zeroCount = num_lines - presentCount
                 if zeroCount > 0:
                     items = list(counts.items()) + [(0, zeroCount)]
                 else:

From 6b64ba40c1867b81fa9d5ace8cb9c8e13dc00302 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 18 Aug 2025 21:15:54 +0200
Subject: [PATCH 12/13] picnixz suggestion: append the zero count for missed lines

---
 Lib/csv.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index a09211edc9654a..e4426b35d71826 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -386,12 +386,14 @@ def _guess_delimiter(self, data, delimiters):
                         charFrequency[char][count] += 1
 
             for char, counts in charFrequency.items():
-                presentCount = sum(counts.values())
-                zeroCount = num_lines - presentCount
-                if zeroCount > 0:
-                    items = list(counts.items()) + [(0, zeroCount)]
-                else:
-                    items = list(counts.items())
+                items = list(counts.items())
+                missed_lines = num_lines - sum(counts.values())
+                if missed_lines:
+                    # charFrequency[char][0] can only be deduced now
+                    # as it cannot be obtained when parsing the lines.
+                    assert 0 not in counts.keys()
+                    # Store the number of lines 'char' was missing from.
+                    items.append((0, missed_lines))
                 if len(items) == 1 and items[0][0] == 0:
                     continue
                 # get the mode of the frequencies

From 4b62c848bae2e2a28e8d70a8aa00ea35b5f5b0ca Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Mon, 18 Aug 2025 21:17:06 +0200
Subject: [PATCH 13/13] use isascii

---
 Lib/csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/csv.py b/Lib/csv.py
index e4426b35d71826..32ac6131d7dd0d 100644
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -382,7 +382,7 @@ def _guess_delimiter(self, data, delimiters):
             for line in data[start:end]:
                 num_lines += 1
                 for char, count in Counter(line).items():
-                    if ord(char) < 127:
+                    if char.isascii():
                         charFrequency[char][count] += 1
 
             for char, counts in charFrequency.items():