I process large XML files with lxml.iterparse. This works well, but as my files have grown a lot bigger lately, I found that iterparse fills up my memory. Consider the following code, which writes a file with 300,000 elem elements and 300,000 other_elem elements:
# Generate /tmp/test.xml: a <root> holding an <elements> section with 300,000
# <elem> records, followed by an <other_elements> section with 300,000
# <other_elem> records. Each record has six sub-elements that all carry the
# record's index, and each record sits on its own line.
els = (
    '<elem>'
    '<subel1>{0}</subel1><subel2>{0}</subel2><subel3>{0}</subel3>'
    '<subel4>{0}</subel4><subel5>{0}</subel5><subel6>{0}</subel6>'
    '</elem>'.format(x)
    for x in range(300000)
)
other_els = (
    '<other_elem>'
    '<subel1>{0}</subel1><subel2>{0}</subel2><subel3>{0}</subel3>'
    '<subel4>{0}</subel4><subel5>{0}</subel5><subel6>{0}</subel6>'
    '</other_elem>'.format(x)
    for x in range(300000)
)

with open('/tmp/test.xml', 'w') as fp:
    fp.write('<root>\n')

    # First section: the records the parsing script is interested in.
    fp.write('<elements>\n')
    fp.writelines(el + '\n' for el in els)
    fp.write('</elements>\n')

    # Second section: records the parsing script does not ask for.
    fp.write('<other_elements>\n')
    fp.writelines(el + '\n' for el in other_els)
    fp.write('</other_elements>\n')

    fp.write('</root>\n')
Then I use the following to parse only the elem elements (and do nothing with them), printing the memory usage every so often:
from lxml import etree
import psutil
import os
# Stream /tmp/test.xml, asking iterparse for <elem> elements only, and report
# this process's resident memory (in whole MiB) every 10,000 elements and once
# more at the end.
process = psutil.Process(os.getpid())

# Newer psutil releases expose memory_info(); older ones only had
# get_memory_info(). Pick whichever this installation provides.
read_memory_info = getattr(process, 'memory_info', None) or process.get_memory_info


def rss_mib():
    """Return the resident set size of this process in whole MiB."""
    # Floor division keeps the integer result on both Python 2 and Python 3.
    return read_memory_info().rss // (1024 * 1024)


gen = etree.iterparse('/tmp/test.xml', tag='elem')
elscount = 0
for ac, el in gen:
    elscount += 1
    # Drop the element's own children and text now that we are done with it.
    el.clear()
    # Remove the already-processed sibling in front of this element so the
    # parent does not keep accumulating emptied nodes.
    if el.getprevious() is not None:
        del el.getparent()[0]
    if elscount % 10000 == 0:
        print(rss_mib())
# NOTE: only elements yielded by the loop above are ever cleared. Anything the
# tag filter does not yield (here, every <other_elem>) is never touched by
# this code.
print(rss_mib())
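The output shows that memory stays low while the elem elements are being parsed, but fills up once the parser moves on to the other_elem s. It looks as if, with the tag argument, iterparse still builds the elements that do not match; if I drop tag and filter with an if instead, I can call el.clear() on every element and memory stays flat. Is there a way to keep using tag without the unmatched elements piling up in memory?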