Finding a few words and printing the next line using Python

Question

Finding a few words and printing the next line using Python

I have a huge text file. It looks like this

> <Enzymologic: Ki nM 1>
 257000

> <Enzymologic: IC50 nM 1>
n/a

> <ITC: Delta_G0 kJ/mole 1>
n/a

> <Enzymologic: Ki nM 1>
5000

> <Enzymologic: EC50/IC50 nM 1>
1000

.....

Now I want to create a python script to find words of type ( > <Enzymologic: Ki nM 1>, > <Enzymologic: EC50/IC50 nM 1>) and print the next line for each word in tab delimited format as follows

> <Enzymologic: Ki nM 1>     > <Enzymologic: EC50/IC50 nM 1>
257000                       n/a
5000                         1000
....

I tried the following code

infile = path of the file
lines = infile.readlines()
infile.close()
searchtxt = "> <Enzymologic: IC50 nM 1>", "> <Enzymologic: Ki nM 1>"
for i, line in enumerate(lines): 
     if searchtxt in line and i+1 < len(lines):
         print lines[i+1]

But it does not work. Can anybody offer some code to achieve this?

Thanks in advance

+3

python

nit Jun 14 '11 at 12:20

source share

6 answers

Drtyrsa · Answer 1 · 2011-06-14T12:36:35+0000

s = '''Enzymologic: Ki nM 1

257000

Enzymologic: IC50 nM 1

n/a

ITC: Delta_G0 kJ/mole 1

n/a

Enzymologic: Ki nM 1

5000

Enzymologic: IC50 nM 1

1000'''
from collections import defaultdict

# Discard blank lines; what remains alternates header, value, header, value...
lines = list(filter(None, s.splitlines()))
# Even positions are the headers, odd positions the value under each header.
keys, values = lines[::2], lines[1::2]
# Collect every value seen for a header, in order of appearance.
result = defaultdict(list)
for key, value in zip(keys, values):
    result[key].append(value)
print(dict(result))

>>> {'ITC: Delta_G0 kJ/mole 1': ['n/a'], 'Enzymologic: Ki nM 1': ['257000', '5000'], 'Enzymologic: IC50 nM 1': ['n/a', '1000']}

Then format the output as you like.

Emmanuel · Answer 2 · 2011-06-14T12:37:14+0000

I think your problem is that you are doing `if searchtxt in line` instead of doing `if pattern in line` for every `pattern` in your `searchtxt`. Here is what I would do:

>>> path = 'D:\\temp\\Test.txt'
>>> lines = open(path).readlines()
>>> searchtxt = "Enzymologic: IC50 nM 1", "Enzymologic: Ki nM 1"
>>> from collections import defaultdict
>>> dict_patterns = defaultdict(list)
>>> for i, line in enumerate(lines):
    for pattern in searchtxt:
        if pattern in line and i+1 < len(lines):
             dict_patterns[pattern].append(lines[i+1])

>>> dict_patterns
defaultdict(<type 'list'>, {'Enzymologic: Ki nM 1': ['257000\n', '5000\n'],
                            'Enzymologic: IC50 nM 1': ['n/a\n', '1000']})

The result is a dict (`defaultdict` is a subclass of `dict`).

shang · Answer 3 · 2011-06-14T14:08:28+0000

import itertools

def search(lines, terms):
    """Collect, for each search term, the line following every match.

    ``lines`` is any iterable of text lines and ``terms`` a sequence of
    substrings to look for.  Returns one list per term, in the order of
    ``terms``: the term itself first, then the stripped text of the line
    that follows each line containing that term.

    A line is attributed to the first term it contains.  The line taken as
    a value is consumed and is not itself checked against the terms.  A
    match on the very last line has no following line and contributes
    nothing.
    """
    results = [[t] for t in terms]
    lines = iter(lines)
    for l in lines:
        for i, t in enumerate(terms):
            if t in l:
                # The builtin next() with a default works on Python 2.6+
                # and 3, and does not raise when the match is the last line.
                following = next(lines, None)
                if following is not None:
                    results[i].append(following.strip())
                break
    return results

def format(results):
    """Render columns as tab-separated text, one row per line.

    ``results`` is a sequence of columns, each a list of strings with the
    header first.  The columns are transposed into rows; shorter columns are
    padded with empty strings so every row has one field per column.
    Returns the whole table as one string with a newline after every row.
    """
    # izip_longest was renamed zip_longest in Python 3; accept either name
    # so the function runs unchanged on both.
    try:
        from itertools import izip_longest as zip_longest
    except ImportError:
        from itertools import zip_longest
    rows = zip_longest(*results, fillvalue="")
    return ''.join("\t".join(row) + '\n' for row in rows)

Example:

example = """> <Enzymologic: Ki nM 1>
257000

> <Enzymologic: IC50 nM 1>
n/a

> <ITC: Delta_G0 kJ/mole 1>
n/a

> <Enzymologic: Ki nM 1>
5000

> <Enzymologic: EC50/IC50 nM 1>
1000"""

def test():
    """Run search() over the sample text and print the formatted table."""
    wanted = ["> <Enzymologic: IC50 nM 1>", "> <Enzymologic: Ki nM 1>"]
    columns = search(example.split('\n'), wanted)
    print(format(columns))

>>> test()
> <Enzymologic: IC50 nM 1>   > <Enzymologic: Ki nM 1>
n/a 257000

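The output is tab-separated. If you want the columns aligned (assuming 8-character tab stops), use this version of format: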

import math

def format(results):
    """Render columns as text aligned on 8-character tab stops.

    ``results`` is a sequence of columns, each a list of strings with the
    header first.  Columns are transposed into rows (shorter columns are
    padded with empty strings) and every cell is followed by enough tabs to
    reach the start of the next column.  Returns the table as one string
    with a newline after every row.
    """
    # izip_longest was renamed zip_longest in Python 3; accept either name.
    try:
        from itertools import izip_longest as zip_longest
    except ImportError:
        from itertools import zip_longest

    tab = 8
    rows = list(zip_longest(*results, fillvalue=""))

    # A column ends at the first tab stop strictly beyond its longest cell.
    # "Strictly" matters: a cell whose length is an exact multiple of the
    # tab size must still be followed by a tab, otherwise it would run
    # straight into the next column.
    maxcolwidth = [0] * len(results)
    for row in rows:
        for i, col in enumerate(row):
            w = (len(col) // tab + 1) * tab
            maxcolwidth[i] = max(maxcolwidth[i], w)

    s = []
    for row in rows:
        for i, col in enumerate(row):
            s.append(col)
            padding = maxcolwidth[i] - len(col)
            # Each tab jumps to the next stop, so round the gap up.
            s.append('\t' * ((padding + tab - 1) // tab))
        s.append('\n')

    return ''.join(s)

e-satis · Answer 4 · 2011-06-14T14:54:17+0000

:

import itertools

# let imitate a file
pseudo_file = """
> <Enzymologic: Ki nM 1>
 257000

> <Enzymologic: IC50 nM 1>
n/a

> <ITC: Delta_G0 kJ/mole 1>
n/a

> <Enzymologic: Ki nM 1>
5000

> <Enzymologic: EC50/IC50 nM 1>
1000
""".split('\n')

def iterate_on_couple(iterable):
    """
    Iterate on two elements, by two elements.

    Yields consecutive, non-overlapping pairs (first, second) taken from
    ``iterable``.  If the iterable holds an odd number of elements, the
    unpaired last element is dropped.
    """
    iterator = iter(iterable)
    for first in iterator:
        try:
            second = next(iterator)
        except StopIteration:
            # Odd number of elements.  Letting StopIteration escape from a
            # generator body is a RuntimeError since Python 3.7 (PEP 479),
            # so finish the generator explicitly instead.
            return
        yield first, second

# keep only the lines that contain something other than whitespace
plain_lines = (row for row in pseudo_file if row.strip())

# group every value under the name found on the line before it
results = {}
for name, value in iterate_on_couple(plain_lines):
    if name not in results:
        results[name] = []
    results[name].append(value)

# each name now maps to the list of all its values
print(results)

, :

, .

-, , , . - , shelve sqlite.

import csv

def get(iterable, index, default):
    """
    Look up ``iterable[index]``, falling back to ``default`` when the
    lookup raises IndexError.
    """
    try:
        item = iterable[index]
    except IndexError:
        item = default
    return item

names = list(results)  # fix the column order once; reused for every row

# the longest column decides how many rows are written (none if no names)
max_size = max(len(results[name]) for name in names) if names else 0

# now we write our tab separated file using the csv module; the with-block
# closes the file, so every row is flushed to disk even if writing fails
with open('/tmp/test.csv', 'w') as csv_file:
    out = csv.writer(csv_file, delimiter='\t')

    # first the header
    out.writerow(names)

    # then write the lines one by one, "-" standing in for missing values
    for i in range(max_size):
        line = [get(results[name], i, "-") for name in names]
        out.writerow(line)

, Python, .

eyquem · Answer 5 · 2011-06-14T20:26:31+0000

import re

pseudo_file = """
> <Enzymologic: Ki nM 1>
 257000

> <Enzymologic: IC50 nM 1>
n/a

> <ITC: Delta_G0 kJ/mole 1>
n/a

> <Enzymologic: Ki nM 1>
5000

> <Enzymologic: EC50/IC50 nM 1>
1000"""

# The two header fragments to look for, in the order their values should
# appear in each output row.
searchtxt = "nzymologic: Ki nM 1>", "<Enzymologic: IC50 nM 1>"

# Splits a header into three groups: everything up to and including the
# first ": ", the following run of non-space/non-tab characters, and the rest.
regx_AAA = re.compile('([^:]+: )([^ \t]+)(.*)')

# Rewrite each header as regex text with a lazy ".*?" inserted before and
# after the second group.  The header text is not passed through re.escape,
# so its characters are used as regex syntax as they are.
tu = tuple(regx_AAA.sub('\\1.*?\\2.*?\\3',x) for x in searchtxt)

# Row template: first field left-justified to the length of the first
# search text, two spaces, second field, newline.
model = '%%-%ss  %%s\n' % len(searchtxt[0])

# First header, whitespace/newlines, group 1 = the value after it, more
# whitespace/newlines, then at least one character followed by the second
# header, whitespace/newlines, group 2 = the value after it, with trailing
# spaces/tabs left out and the match ending at a line end or end of string.
# No DOTALL flag is given, so "." does not cross line boundaries.
regx_BBB = re.compile(('%s[ \t\r\n]+(.+)[ \t\r\n]+'
                       '.+?%s[ \t\r\n]+(.+?)[ \t]*(?=\r?\n|\Z)') % tu)


# Show the intermediate objects; findall returns one (value1, value2) tuple
# per match because the pattern has two groups.
print 'tu   ==',tu
print 'model==',model
print 'regx_BBB.findall(pseudo_file)==\n',regx_BBB.findall(pseudo_file)



# Write the table: one header row built from the search texts, then one row
# per matched pair of values.
with open('woof.txt','w') as f:
    f.write(model % searchtxt)
    f.writelines(model % x for x in regx_BBB.findall(pseudo_file))

tu   == ('nzymologic: .*?Ki.*? nM 1>', '<Enzymologic: .*?IC50.*? nM 1>')
model== %-20s  %s

regx_BBB.findall(pseudo_file)==
[('257000', 'n/a'), ('5000', '1000')]

woof.txt:

> <Enzymologic: Ki nM 1>  > <Enzymologic: IC50 nM 1>
257000                    n/a
5000                      1000

regx_BBB, tu, , " > " searchtxt

, tu . *? searchtxt, regx_BBB , IC50, , searchtxt

, "nzymologic: Ki nM 1>" "<Enzymologic: IC50 nM 1>" searchtxt, , , , , .

, : : searchtxt

.

1

, '> <Enzymologic: IC50 nM 1>' '> <Enzymologic: EC50/IC50 nM 1>' '> <Enzymologic: Ki nM 1>'

, , , ( : )

, regx_BBB:

# Splits a header into three groups: everything up to and including the
# first ": ", the following run of non-space/non-tab characters, and the rest.
regx_AAA = re.compile('([^:]+: )([^ \t]+)(.*)')

# One regex text per search header, with a lazy ".*?" inserted before and
# after the second group.
li = [ regx_AAA.sub('\\1.*?\\2.*?\\3',x) for x in searchtxt]

# '|'.join(li) builds the alternation of all headers; .join('()') then puts
# that alternation between "(" and ")", making it group 1.  Group 2 is the
# value that follows the header, with trailing spaces/tabs left out.
regx_BBB = re.compile('|'.join(li).join('()') + '[ \t\r\n]+(.+?)[ \t]*(?=\r?\n|\Z)')

But formatting the output file will be more difficult. I'd rather not write new complete code without knowing exactly what is needed.

jkdba · Answer 6 · 2013-09-09T18:36:34+0000

Probably the easiest way to find a string in a line and then print the following line is to use itertools.islice:

    from itertools import islice
    searchtxt = "<Enzymologic: IC50 nM 1>"
    with open('file.txt', 'r') as itfile:
        for line in itfile:
            if searchtxt in line:
                # the matching header line itself
                print(line)
                # islice pulls the next line from the same file iterator, so
                # the loop will not see that line again; join turns the
                # zero-or-one lines into a string ('' at end of file)
                print(''.join(islice(itfile, 1)))

Finding a few words and printing the next line using Python

1

More articles: