@@ -362,31 +362,33 @@ def _guess_delimiter(self, data, delimiters):
362362 try and evaluate the smallest portion of the data possible, evaluating
363363 additional chunks as necessary.
364364 """
365+ from collections import Counter , defaultdict
365366
366367 data = list (filter (None , data .split ('\n ' )))
367368
368- ascii = [chr (c ) for c in range (127 )] # 7-bit ASCII
369-
370369 # build frequency tables
371370 chunkLength = min (10 , len (data ))
372371 iteration = 0
373- charFrequency = {}
372+ num_lines = 0
373+ # {char -> {count_per_line -> num_lines_with_that_count}}
374+ char_frequency = defaultdict (Counter )
374375 modes = {}
375376 delims = {}
376377 start , end = 0 , chunkLength
377378 while start < len (data ):
378379 iteration += 1
379380 for line in data [start :end ]:
380- for char in ascii :
381- metaFrequency = charFrequency .get (char , {})
382- # must count even if frequency is 0
383- freq = line .count (char )
384- # value is the mode
385- metaFrequency [freq ] = metaFrequency .get (freq , 0 ) + 1
386- charFrequency [char ] = metaFrequency
387-
388- for char in charFrequency .keys ():
389- items = list (charFrequency [char ].items ())
381+ num_lines += 1
382+ for char , count in Counter (line ).items ():
383+ if char .isascii ():
384+ char_frequency [char ][count ] += 1
385+
386+ for char , counts in char_frequency .items ():
387+ items = list (counts .items ())
388+ missed_lines = num_lines - sum (counts .values ())
389+ if missed_lines :
390+ # Store the number of lines 'char' was missing from.
391+ items .append ((0 , missed_lines ))
390392 if len (items ) == 1 and items [0 ][0 ] == 0 :
391393 continue
392394 # get the mode of the frequencies
0 commit comments