Get newline statistics for a text file in Python

Question

Get newline statistics for a text file in Python

I ran into a nasty CRLF / LF conflict in a file tracked by git, which was probably committed from a Windows machine. Is there a cross-platform way (preferably in Python) to determine which type of newline dominates in the file?

I have this code (based on the idea of https://stackoverflow.com/a/3/910962/ ... ):

# Print how often each newline sequence occurs in the file named on the command line.
import sys
if not sys.argv[1:]:
  sys.exit('usage: %s <filename>' % sys.argv[0])

# Binary mode: the bytes are read as stored, with no newline translation.
# NOTE(review): on Python 3, d is a bytes object and count() with a str argument
# raises TypeError; this snippet appears to target Python 2.
with open(sys.argv[1],"rb") as f:
  d = f.read()
  # NOTE(review): the two pair searches are independent, so they overlap each other.
  # In '\r\n\r\n' the middle '\n\r' is counted as lfcr although both of its characters
  # already belong to crlf pairs.
  crlf, lfcr = d.count('\r\n'), d.count('\n\r')
  # These count every '\r' and every '\n', including the ones inside the pairs above.
  cr, lf = d.count('\r'), d.count('\n')
  print('crlf: %s' % crlf)
  print('lfcr: %s' % lfcr)
  print('cr: %s' % cr)
  print('lf: %s' % lf)
  # NOTE(review): subtracting both pair counts removes the overlapping characters twice,
  # which is why these differences can go negative.
  print('\ncr-crlf-lfcr: %s' % (cr - crlf - lfcr))
  print('lf-crlf-lfcr: %s' % (lf - crlf - lfcr))
  print('\ntotal (lf+cr-2*crlf-2*lfcr): %s\n' % (lf + cr - 2*crlf - 2*lfcr))

But this gives incorrect statistics (for this file ):

crlf: 1123
lfcr: 58
cr: 1123
lf: 1123

cr-crlf-lfcr: -58
lf-crlf-lfcr: -58

total (lf+cr-2*crlf-2*lfcr): -116

+1

python newline

anatoly techtonik Apr 17 '15 at 9:49

source share

4 answers

The submitted code does not work properly because the counter counts the individual characters in the file - it does not look for pairs of characters, such as \r\n and \n\r.

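Here is some Python 2.6 code that finds the 4 EOL markers \r\n, \n\r, \r and \n using a regex. The trick is to look for the \r\n and \n\r pairs before looking for the single-character EOL markers.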

For testing purposes, it generates some random text data containing a mix of these EOL markers.

#!/usr/bin/env python

''' Find and count various line ending character combinations

    Builds a random test string, extracts every EOL marker from it with a
    regex, and prints the markers grouped by frequency.
    Uses Python 2 print statements.

    From http://stackoverflow.com/q/29695861/4014959

    Written by PM 2Ring 2015.04.17
'''

import random
import re
from itertools import groupby

# Fixed seed so every run produces the same test data.
random.seed(42)

# Make a random text string containing various EOL combinations.
# tokens is a list of single characters (A-K and space twice each, plus one
# lone '\r' and one lone '\n') followed by the two-character pairs.
tokens = list(2*'ABCDEFGHIJK ' + '\r\n') + ['\r\n', '\n\r']
datasize = 300
data = ''.join([random.choice(tokens) for _ in range(datasize)])
print repr(data), '\n'

# Regex to find the various EOL combinations. The two-character pairs are
# listed first so the alternation tries them before the single characters.
pat = re.compile(r'\r\n|\n\r|\r|\n')

# All EOL markers, in the order they occur in the data.
eols = pat.findall(data)
print eols, '\n'

# groupby only merges adjacent equal items, hence the sort; each entry is
# a (count, marker) tuple.
grouped = [(len(list(group)), key) for key, group in groupby(sorted(eols))]
# Most frequent marker first.
print sorted(grouped, reverse=True)

'FAHGIG\rC AGCAFGDGEKAKHJE\r\nJCC EKID\n\rKD F\rEHBGICGCHFKKFH\r\nGFEIEK\n\rFDH JGAIHF\r\n\rIG \nAHGDHE\n G\n\rCCBDFK BK\n\rC\n\r\rAIHDHFDAA\r\n\rHCF\n\rIFFEJDJCAJA\r\n\r IB\r\r\nCBBJJDBDH\r FDIFI\n\rGACDGJEGGBFG\n\rBGGFD\r\nDBJKFCA BIG\n\rC J\rGFA HG\nA\rDB\n\r \n\r\n EBF BK\n\rHJA \r\n\n\rDIEI\n\rEDIBEC E\r\nCFEGGD\rGEF EC\r\nFIG GIIJCA\n\r\n\rCFH\r\n\r\rKE HF\n\rGAKIG\r\nDDCDHEIFFHB\n C HAJFHID AC\r' 

['\r', '\r\n', '\n\r', '\r', '\r\n', '\n\r', '\r\n', '\r', '\n', '\n', '\n\r', '\n\r', '\n\r', '\r', '\r\n', '\r', '\n\r', '\r\n', '\r', '\r', '\r\n', '\r', '\n\r', '\n\r', '\r\n', '\n\r', '\r', '\n', '\r', '\n\r', '\n\r', '\n', '\n\r', '\r\n', '\n\r', '\n\r', '\r\n', '\r', '\r\n', '\n\r', '\n\r', '\r\n', '\r', '\r', '\n\r', '\r\n', '\n', '\r'] 

[(17, '\n\r'), (14, '\r'), (12, '\r\n'), (5, '\n')]

And here is a version that reads the data from a file named on the command line.

# Find and count the EOL markers in the file named on the command line.
# Written to run unchanged on Python 2.6+ and Python 3.
import re
from itertools import groupby
import sys

if not sys.argv[1:]:
    # sys.exit, not exit(): the latter is a convenience injected by the site
    # module for interactive use and is not guaranteed to exist.
    sys.exit('usage: %s <filename>' % sys.argv[0])

# Binary mode so the EOL bytes are seen exactly as stored on disk.
with open(sys.argv[1], 'rb') as f:
    data = f.read()

# Single-argument print calls behave the same on Python 2 and 3; the
# ' \n' suffix reproduces the spacing of the old "print x, '\n'" form.
print('%s \n' % repr(data))

# Bytes pattern, because the data was read in binary mode. The two-byte
# pairs are listed first so the alternation tries them before the single
# characters.
pat = re.compile(br'\r\n|\n\r|\r|\n')

# All EOL markers, in the order they occur in the file.
eols = pat.findall(data)
print('%s \n' % (eols,))

# groupby only merges adjacent equal items, hence the sort; each entry is
# a (count, marker) tuple, printed most frequent first.
grouped = [(len(list(group)), key) for key, group in groupby(sorted(eols))]
print(sorted(grouped, reverse=True))

+1

PM 2Ring Apr 17 '15 at 11:18

To see why, consider a file that contains just three CRLF line endings: \r\n\r\n\r\n. Your code counts it like this:

crlf: 3 -- [\r\n][\r\n][\r\n]
lfcr: 2 -- \r[\n\r][\n\r]\n
cr: 3   -- [\r]\n[\r]\n[\r]\n
lf: 3   -- \r[\n]\r[\n]\r[\n]

cr-crlf-lfcr: -2
lf-crlf-lfcr: -2

total (lf+cr-2*crlf-2*lfcr): -4

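As you can see, every \n and \r is counted again as part of the crlf and lfcr pairs, and the pairs themselves overlap. It is better to check each line with line.endswith(). If you also want the cr and lf totals, each line ending in \r\n or \n\r contributes cr + 1 and lf + 1.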

+1

go2 Apr 17 '15 at 11:20

source share

The best way to handle line endings in git is to use the git configuration. You can define how line endings are handled globally, for a specific repository, or for specific files. In the .gitattributes file you can specify that certain files must be converted to your system's native line endings on every checkout and converted back on every commit. For more information, see GitHub's help page on line endings.

+1

Mykhaylo kopytonenko Apr 17 '15 at 11:23

source share

sorrat · Accepted Answer · 2015-04-17T11:17:47+0000

import re
import sys

# Two-byte sequences are listed first so a pair is never split into two
# single-character matches. The lookahead on LFCR handles the ambiguous run
# "\n\r\n": it is read as LF followed by CRLF (an LF-terminated line followed
# by an empty CRLF line, common in files with mixed endings), not as LFCR
# followed by LF.
_EOL_PATTERN = re.compile(br'\r\n|\n\r(?!\n)|\r|\n')


def calculate_line_endings(filename):
    """Print how many line endings of each kind *filename* contains.

    The file is read as raw bytes and scanned once from left to right, so
    every byte belongs to at most one line ending. Four kinds are reported,
    one per output line: ``crlf`` (\\r\\n), ``lfcr`` (\\n\\r), ``cr`` (a lone
    \\r) and ``lf`` (a lone \\n).

    The whole file is loaded into memory. Iterating a binary file line by
    line is not an option here, because that splits only on \\n and so
    cannot see lone-\\r or \\n\\r endings.

    Args:
        filename: Path of the file to analyse.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    # "with" guarantees the handle is closed even if reading fails.
    with open(filename, "rb") as f:
        data = f.read()

    counts = {b'\r\n': 0, b'\n\r': 0, b'\r': 0, b'\n': 0}
    for eol in _EOL_PATTERN.findall(data):
        counts[eol] += 1

    print('crlf: %s' % counts[b'\r\n'])
    print('lfcr: %s' % counts[b'\n\r'])
    print('cr: %s' % counts[b'\r'])
    print('lf: %s' % counts[b'\n'])


if __name__ == '__main__':
    if len(sys.argv) == 1:
        sys.exit('usage: %s <filename>' % sys.argv[0])
    else:
        calculate_line_endings(sys.argv[1])

Gives output for your file

crlf: 1123
lfcr: 0
cr: 0
lf: 0

is that enough?

Get newline statistics for a text file in Python

More articles: