Removing line breaks from a text file under certain conditions

I have a text file of about 20,000 lines. Each block of significant data consists of a name, address, city, state, zip code, and phone. My file has each of these on a separate line, so the file looks like this:

StoreName1
, Address
, City
,State
,Zip
, Phone

StoreName2
, Address
, City
,State
,Zip
, Phone

I need to create a CSV file with the above information for each store on a single line:

StoreName1, Address, City,State,Zip, Phone
StoreName2, Address, City,State,Zip, Phone

So essentially, I am trying to remove \r\n only at the appropriate points. How can I do this with Python's re module? Examples would be very helpful, as I am new to this.

Thanks.

+3
source share
5 answers

s/[\r\n]+,/,/g

Replace globally 'linebreak (s),' with ','

Edit:
:

s/[\r\n]+(,|[\r\n])/$1/g

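This replaces one or more line breaks followed by a comma or another line break with capture group 1 (the comma or line break itself).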

Edit:
, , :

s/[\r\n]+\s*(,|[\r\n])\s*/$1/g

+3

/

from itertools import groupby
with open("inputfile.txt") as f:
    # groupby yields alternating runs of whitespace-only and non-blank lines;
    # every non-blank run is stripped line by line and joined into one row.
    for is_blank, lines in groupby(f, key=str.isspace):
        if is_blank:
            continue
        row = "".join(line.strip() for line in lines)
        ...
+2

, "" - . - :

with open('data.txt') as fhi, open('newdata.txt', 'w') as fho:
  # Assumes a fixed layout: one store-name line, five field lines, then one
  # separator line. The for-loop and the next() calls share the same file
  # iterator, so each loop iteration starts on the next store-name line.
  for store in fhi:
    # Read in the remaining five lines of this record.
    fields = [next(fhi).rstrip() for _ in range(5)]

    # Generate a list of all fields for this store.
    row = [store.rstrip()] + fields

    # Output the record as a single line; the fields already carry their
    # leading commas, so they are joined without a separator.
    fho.write('%s\n' % ''.join(row))

    # Consume the separator line. The default keeps the final record from
    # raising StopIteration when the file does not end with a blank line.
    next(fhi, None)
+1

import re

# Sample input: two records whose lines end in \r\n, separated by an empty line.
ch = ('StoreName1\r\n'
      ', Address\r\n'
      ', City\r\n'
      ',State\r\n'
      ',Zip\r\n'
      ', Phone\r\n'
      '\r\n'
      'StoreName2\r\n'
      ', Address\r\n'
      ', City\r\n'
      ',State\r\n'
      ',Zip\r\n'
      ', Phone')

# A record must start at the beginning of the text (optionally after one
# leading \r\n) or right after an empty line; each of its six lines is captured
# without its line break. Raw strings keep regex escapes such as \A from being
# treated as (invalid) Python string escapes.
regx = re.compile(r'(?:(?<=\r\n\r\n)|(?<=\A)|(?<=\A\r\n))'
                  r'(.+?)\r\n(,.+?)\r\n(,.+?)\r\n(,.+?)\r\n(,.+?)\r\n(,[^\r\n]+)')

# Text mode with newline='' writes the explicit '\r\n' unchanged; binary mode
# ('wb') rejects str on Python 3.
with open('csvoutput.txt', 'w', newline='') as f:
    f.writelines(''.join(mat.groups()) + '\r\n' for mat in regx.finditer(ch))

Note: ch uses Windows line endings (newline == \r\n).

# Same record pattern without capture groups: the whole six-line record is
# matched and its internal line breaks are removed afterwards. Raw strings keep
# regex escapes such as \A from being treated as Python string escapes.
regx = re.compile(r'(?:(?<=\r\n\r\n)|(?<=\A)|(?<=\A\r\n))'
                  r'.+?\r\n,.+?\r\n,.+?\r\n,.+?\r\n,.+?\r\n,[^\r\n]+')

# Text mode with newline='' writes the explicit '\r\n' unchanged; binary mode
# ('wb') rejects str on Python 3.
with open('csvoutput.txt', 'w', newline='') as f:
    f.writelines(mat.group().replace('\r\n', '') + '\r\n' for mat in regx.finditer(ch))

, CSV , :

import csv

# Capture the six fields of each record without their leading commas so the
# csv writer can insert its own delimiter. Raw strings keep regex escapes such
# as \A from being treated as Python string escapes.
regx = re.compile(r'(?:(?<=\r\n\r\n)|(?<=\A)|(?<=\A\r\n))'
                  r'(.+?)\r\n,(.+?)\r\n,(.+?)\r\n,(.+?)\r\n,(.+?)\r\n,([^\r\n]+)')

# On Python 3 the csv module needs a text-mode file opened with newline='';
# a binary-mode ('wb') file makes writerow() raise TypeError.
with open('csvtry3.txt', 'w', newline='') as f:
    csvw = csv.writer(f, delimiter='#')
    for mat in regx.finditer(ch):
        csvw.writerow(mat.groups())

.

1

, tchrist, :

# Delete every \r\n that is not immediately preceded by another \r\n, i.e.
# keep only the second line break of each empty-line separator.
regx = re.compile(r'(?<!\r\n)\r\n')
# Text mode with newline='' writes the remaining '\r\n' unchanged; binary mode
# ('wb') rejects str on Python 3.
with open('csvtry.txt', 'w', newline='') as f:
    f.write(regx.sub('', ch))

.

EDIT 2

:

# Split the text on empty lines, strip the line breaks inside each chunk and
# write one chunk per output line. Text mode with newline='' writes the
# explicit '\r\n' unchanged; binary mode ('wb') rejects str on Python 3.
with open('csvtry.txt', 'w', newline='') as f:
    f.writelines(x.replace('\r\n', '') + '\r\n' for x in ch.split('\r\n\r\n'))

.

3

, ch:

'a la gnibbler " , , :

from itertools import groupby
with open('csvinput.txt','r') as f,open('csvoutput.txt','w') as g:
    # Consecutive lines are grouped by whether they contain any non-whitespace
    # text; each such run is collapsed onto a single output line.
    for has_text, lines in groupby(f, key=lambda v: not v.isspace()):
        if has_text:
            merged = ''.join(lines).replace('\n', '')
            g.write(merged + '\n')

I have another regex solution:

import re
# Match the shortest run of newline-terminated, non-empty lines that starts at
# a line start and is followed by an empty line or the end of the text.
# The raw string keeps \Z from being treated as an invalid Python string escape.
regx = re.compile(r'^((?:.+?\n)+?)(?=\n|\Z)', re.MULTILINE)
with open('input.txt', 'r') as f, open('csvoutput.txt', 'w') as g:
    # Collapse each matched run onto one line by removing its inner newlines.
    g.writelines(mat.group().replace('\n', '') + '\n' for mat in regx.finditer(f.read()))

I find it similar to gnibbler's solution.

0
source
# Read the whole input file into memory; the context manager closes the
# handle even if reading fails.
with open(infilepath, 'r') as f:
    s = f.read()

# Temporarily mark each empty-line separator with the two-character text "\n"
# (backslash + n), drop every remaining newline, then turn the markers back
# into real newlines so each record ends up on its own line.
s = s.replace('\n\n', '\\n')
s = s.replace('\n', '')
s = s.replace("\\n", "\n")

# Reopen in write mode: the original reopened the file with 'r', which makes
# write() fail instead of replacing the file's contents.
with open(infilepath, 'w') as f:
    f.write(s)

That should do it. It overwrites your input file with the reformatted content.

-1
source

All Articles