Convert tabbed text to an unordered HTML list?

I'm a beginner programmer, so this question may seem trivial: I have text files containing tab delimited text, for example:

A
    B
    C
        D
        E

Now I want to create unordered .html lists from this, with the structure:

<ul>
<li>A
<ul><li>B</li>
<li>C
<ul><li>D</li>
<li>E</li></ul></li></ul></li>
</ul>

My idea was to write a Python script, but if there is a simpler (automatic) way, that's good too. To determine the indentation level and element name, I would try using this code:

import sys
# Draft loop from the question: for every line on stdin, count its leading
# tabs and compare that count with the indentation level seen so far.
# Indentation level the loop currently considers itself to be at.
indent = 0
# Stack adjusted on every level change; presumably meant to hold the item
# name for each open level -- TODO confirm, nothing here ever stores a name.
last = []
for line in sys.stdin:
    # Count the leading tab characters, stripping them from the line as we go.
    count = 0
    while line.startswith("\t"):
       count += 1
       line = line[1:]
    if count > indent:
       # Deeper than the tracked level: step in by exactly one level, even if
       # count is more than one above indent.
       indent += 1
       # NOTE(review): `last` starts empty and this is the only place that
       # appends to it, so last[-1] raises IndexError on the first indented line.
       last.append(last[-1])
    elif count < indent:
       # Shallower than the tracked level: step out by exactly one level and
       # drop the top stack entry.
       indent -= 1
       last = last[:-1]
+5
source share
4 answers

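The tokenize module understands your input format: each line contains a valid Python identifier, and the indentation level of the lines is significant. The ElementTree module lets you manipulate tree structures in memory, so it may be more flexible to separate building the tree from rendering it as html: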

from tokenize import NAME, INDENT, DEDENT, ENDMARKER, NEWLINE, generate_tokens
from xml.etree import ElementTree as etree

def parse(file, TreeBuilder=etree.TreeBuilder):
    """Build a nested ``<ul>``/``<li>`` tree from indented Python identifiers.

    The input is tokenized with Python's own tokenizer, so every non-blank
    line must hold a single valid identifier and indentation must follow
    Python's rules. Each identifier becomes an ``<li>``; every indentation
    step opens a new ``<ul>`` that is appended as a sibling of the preceding
    ``<li>`` (not nested inside it), and every dedent closes one.

    Parameters:
        file: object with a ``readline()`` method returning text lines.
        TreeBuilder: factory for an object providing ``start(tag, attrs)``,
            ``data(text)``, ``end(tag)`` and ``close()``.

    Returns:
        Whatever ``TreeBuilder().close()`` returns; for the default builder
        this is the root ``<ul>`` element.

    Raises:
        ValueError: if the input contains anything other than identifiers,
            line breaks and indentation (for example numbers or operators).
    """
    # Local import: NL (blank-line token) is not among the file-level imports.
    from tokenize import NL

    tb = TreeBuilder()
    tb.start('ul', {})
    for type_, text, start, end, line in generate_tokens(file.readline):
        if type_ == NAME:  # convert name to <li> item
            tb.start('li', {})
            tb.data(text)
            tb.end('li')
        elif type_ in (NEWLINE, NL):
            # NEWLINE ends an item line, NL marks a blank line; neither
            # contributes anything to the tree.
            continue
        elif type_ == INDENT:  # start <ul>
            tb.start('ul', {})
        elif type_ == DEDENT:  # end </ul>
            tb.end('ul')
        elif type_ == ENDMARKER:  # done
            tb.end('ul')  # end parent list
            break
        else:
            # An explicit exception instead of ``assert``: asserts vanish
            # under ``python -O`` and bad tokens would be silently skipped.
            raise ValueError(
                "unexpected token: %r" % ((type_, text, start, end, line),))
    return tb.close()  # return root element

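Any class that provides .start(), .end(), .data() and .close() methods can be used as the TreeBuilder, so you could, for example, write html on the fly instead of building a tree.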

To parse stdin and write the html to stdout, you could use ElementTree.write():

import sys

# ElementTree.write() with its default encoding emits bytes. On Python 3 the
# text-mode sys.stdout rejects bytes, so write to its binary buffer when one
# exists; Python 2's sys.stdout has no .buffer and accepts byte strings as is.
etree.ElementTree(parse(sys.stdin)).write(
    getattr(sys.stdout, 'buffer', sys.stdout), method='html')

Output:

<ul><li>A</li><ul><li>B</li><li>C</li><ul><li>D</li><li>E</li></ul></ul></ul>

You can use any file objects, not just sys.stdin/sys.stdout.

Note: to write to stdout on Python 3, use sys.stdout.buffer or pass encoding="unicode", because of the bytes/Unicode distinction.

+2

Try this (it works on your test case):

import itertools
def listify(filepath):
    """Print the tab-indented lines of *filepath* as nested HTML lists.

    Each non-blank line becomes one ``<li>`` item. Its depth is the number
    of leading tab characters: going deeper prints one ``<ul>`` per extra
    level, going shallower prints one ``</ul>`` per level left. The whole
    output is wrapped in an outer ``<ul>`` ... ``</ul>`` pair.

    Parameters:
        filepath: path of the text file to read.

    Returns:
        None; the markup is written to standard output.
    """
    depth = 0
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("<ul>" * (depth + 1))
    # The with-block closes the file even if printing fails part-way.
    with open(filepath) as source:
        for line in source:
            line = line.rstrip()
            if not line:
                # Blank (or whitespace-only) lines carry no item; skipping
                # them avoids closing the open lists and emitting <li></li>.
                continue
            newDepth = len(line) - len(line.lstrip('\t'))
            if newDepth > depth:
                print("<ul>" * (newDepth - depth))
            elif depth > newDepth:
                print("</ul>" * (depth - newDepth))
            print("<li>%s</li>" % line.strip())
            depth = newDepth
    # Close whatever nested lists are still open plus the outer list.
    print("</ul>" * (depth + 1))

,

+5

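I think the algorithm goes like this:

  • keep track of the current indentation level (the number of tabs at the start of the line)

  • if the indentation level increases: emit <ul> <li>current item</li>

  • if the indentation level decreases: emit <li>current item</li></ul>

  • if the indentation level stays the same: emit <li>current item</li>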

Putting this into code is left as an exercise for the OP.

0

. , \t, \t +\t \t\t-\t \t.

, "in.txt" , . , . - . , .

JF Sebastian's solution is fine, but does not handle Unicode.

Create a text file "in.txt" encoded in UTF-8:

qqq
    www
    www
        
        
    
    
qqq
qqq

and run the script "ul.py". The script will create "out.html" and open it in Firefox.

#!/usr/bin/python
# -*- coding: utf-8 -*-

# The script exports a tabbed list from string into a HTML unordered list.

import io, subprocess, sys

# Read the whole source text, decoding it explicitly as UTF-8. The with-block
# closes the file even if reading or decoding raises.
with io.open('in.txt', 'r',  encoding='utf8') as f:
    s=f.read()

#---------------------------------------------

def ul(s):
    """Render text with tab-indented paragraphs as an HTML page fragment.

    The input is split into paragraphs on blank lines ('\\n\\n'). A paragraph
    without any tab character is copied through unchanged. A paragraph that
    contains a tab is turned into nested <ul> lists: every non-empty line is
    one <li> item, and the number of tab characters in the line is its depth.
    All tabs are removed from the generated markup.

    If the numbers of '<ul>' and '</ul>' in a generated list differ (which
    happens, for example, when the depth grows by more than one level and
    later drops back), a red warning <div> is appended after that list.

    Returns the HTML 4.01 doctype and page head followed by every paragraph,
    each terminated by a blank line. The <body> and <html> elements are left
    unclosed.
    """
    pieces = [
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\n'
        '<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><title>List Out</title></head><body>'
    ]

    for paragraph in s.split('\n\n'):
        # Paragraphs without tabs are not lists: pass them through untouched.
        if '\t' not in paragraph:
            pieces.append(paragraph)
            pieces.append('\n\n')
            continue

        markup = ['<ul>']
        level = 0        # depth of the previously emitted item
        nested_open = 0  # nested <ul> elements still waiting for a close tag

        for row in paragraph.split('\n'):
            if not row:
                continue
            tabs = row.count('\t')
            if tabs > level:
                # Only one list is opened, however far the depth jumped.
                nested_open += 1
                markup.append('<ul>')
            elif tabs < level:
                markup.append('</ul>' * (level - tabs))
                nested_open = tabs
            markup.append('<li>' + row + '</li>')
            level = tabs

        markup.append('</ul>' * nested_open + '</ul>')
        raw = ''.join(markup)
        rendered = raw.replace('\t', '')

        # The balance check runs on the markup as built, before tab removal.
        opened = raw.count('<ul>')
        closed = raw.count('</ul>')
        if opened != closed:
            rendered = rendered + '<div style="color: red;">Wrong bullets position.<br>&lt;ul&gt;: ' + str(opened) + '<br>&lt;&frasl;ul&gt;: ' + str(closed) + '<br> Correct your source.</div>'

        pieces.append(rendered)
        pieces.append('\n\n')

    return ''.join(pieces)

#-------------------------------------      

def detach(cmd):
    """Start *cmd* through the shell, then terminate this script.

    Parameters:
        cmd: command line string, interpreted by the shell (shell=True).

    Raises:
        SystemExit: always, via sys.exit(), once the child has been started.
    """
    import os  # local import: the file-level imports do not include os

    # Send the child's output to the null device instead of to pipes. This
    # process exits right away, and a pipe with no reader would break as soon
    # as the child wrote to stdout or stderr.
    with open(os.devnull, 'wb') as devnull:
        subprocess.Popen(cmd, stdout=devnull, stderr=devnull, shell=True)
    sys.exit()

# Convert the source text to HTML.
s=ul(s)

# Write the page as UTF-8. The with-block closes the file even if the write
# fails, and `s` keeps the HTML instead of being rebound to write()'s result.
with io.open('out.html', 'w',  encoding='utf8') as f:
    f.write(s)

# Open the result in Firefox; detach() ends the script afterwards.
cmd='firefox out.html'
detach(cmd)

HTML will be:

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="content-type"><title>List Out</title></head><body><ul><li>qqq</li><ul><li>www</li><li>www</li><ul><li></li><li></li></ul><li></li><li></li></ul><li>qqq</li><li>qqq</li></ul>
-1
source

All Articles