Skip to content

Commit 07a336b

Browse files
committed
update docs, no set operations
1 parent 4b50610 commit 07a336b

File tree

3 files changed

+23
-19
lines changed

3 files changed

+23
-19
lines changed

Doc/whatsnew/3.15.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,13 @@ New modules
214214
Improved modules
215215
================
216216

217+
csv
218+
---
219+
220+
* The :meth:`csv.Sniffer.sniff` delimiter detection has been optimized,
221+
and is now up to 1.5x faster.
222+
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
223+
217224
dbm
218225
---
219226

Lib/csv.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ class excel:
8484
__version__ = "1.0"
8585

8686

87-
_ASCII_CHARS = frozenset(map(chr, range(127))) # 7-bit ASCII
88-
8987
class Dialect:
9088
"""Describe a CSV dialect.
9189
@@ -373,29 +371,28 @@ def _guess_delimiter(self, data, delimiters):
373371
# build frequency tables
374372
chunkLength = min(10, len(data))
375373
iteration = 0
374+
seen = 0
376375
# {char -> {count_per_line -> num_lines_with_that_count}}
377376
charFrequency = defaultdict(Counter)
378377
modes = {}
379378
delims = {}
380379
start, end = 0, chunkLength
381380
while start < len(data):
382381
iteration += 1
383-
chunk = data[start:end]
384-
candidate_chars = set().union(*chunk)
385-
candidate_chars &= _ASCII_CHARS
386-
for line in chunk:
387-
for char in candidate_chars:
388-
count = line.count(char)
389-
charFrequency[char][count] += 1
390-
391-
# must count even if frequency is 0
392-
missing_chars = _ASCII_CHARS - candidate_chars
393-
chunk_len = len(chunk)
394-
for char in missing_chars:
395-
charFrequency[char][0] += chunk_len
396-
397-
for char in charFrequency.keys():
398-
items = list(charFrequency[char].items())
382+
for line in data[start:end]:
383+
seen += 1
384+
charCounts = Counter(line)
385+
for char, count in charCounts.items():
386+
if ord(char) < 127:
387+
charFrequency[char][count] += 1
388+
389+
for char, counts in charFrequency.items():
390+
presentCount = sum(counts.values())
391+
zeroCount = seen - presentCount
392+
if zeroCount > 0:
393+
items = list(counts.items()) + [(0, zeroCount)]
394+
else:
395+
items = list(counts.items())
399396
if len(items) == 1 and items[0][0] == 0:
400397
continue
401398
# get the mode of the frequencies
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
:meth:`csv.Sniffer.sniff` 2x faster
1+
Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.5x.

0 commit comments

Comments
 (0)