How to get all text between two specified tags using BeautifulSoup?
# Sample markup from the question. The "..." lines are placeholders
# (presumably for surrounding page content that was left out).
html = """
...
<tt class="descname">all</tt>
<big>(</big>
<em>iterable</em>
<big>)</big>
<a class="headerlink" href="#all" title="Permalink to this definition">¶</a>
...
"""
I want to get all the text from the first opening `<big>` tag up to the first occurrence of an `<a>` tag. For this example, that means I should get `(iterable)` as a string.
I would avoid nextSibling, since from your question you want to include everything until the next <a>, regardless of whether it is in a sibling, parent or child.
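Therefore, I think the best approach is to find the node that is the next `<a>` element and walk forward through the document until you reach it, appending each string along the way. You may need to tidy this up if your HTML differs much from the sample, but something like this should work: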
from bs4 import BeautifulSoup
# Parse the `html` string from the question. Naming the parser explicitly
# avoids bs4's "no parser was explicitly specified" warning and keeps the
# resulting tree the same on machines with different parsers installed.
# NOTE: `html` is rebound from the raw string to the parsed document here.
html = BeautifulSoup(html, 'html.parser')
# find() returns the first matching tag directly, rather than building the
# full result list only to take element 0.
firstBigTag = html.find('big')
# The first <a> after that <big> in document order: the point at which text
# collection stops.
nextATag = firstBigTag.find_next('a')
def loopUntilA(text, firstElement, stopElement=None):
    """Return ``text`` plus every string node from ``firstElement`` to a stop.

    The document is walked in parse order via ``next_element``, starting at
    ``firstElement``. Every string node met on the way is appended verbatim
    (whitespace-only strings included); tags contribute only through the
    string nodes inside them. The walk ends when ``stopElement`` itself is
    reached or when the document runs out.

    Args:
        text: Prefix to which the collected strings are appended.
        firstElement: Node (tag or string) where collection starts.
        stopElement: Node at which collection stops; it and everything after
            it are excluded. When omitted, the module-level ``nextATag`` is
            used, which is what this function always did before the
            parameter existed.

    Returns:
        ``text`` followed by the concatenated string nodes.
    """
    if stopElement is None:
        stopElement = nextATag

    # Text type on both Python 2 (unicode) and Python 3 (str); parsed string
    # nodes are subclasses of it, tags are not.
    text_type = type(u"")

    pieces = [text]
    node = firstElement
    # Step one node at a time. A fixed two-node stride only works when tags
    # and strings strictly alternate, and jumps over the stop tag as soon as
    # a whitespace string sits between two tags.
    # Identity, not ==, so a look-alike tag elsewhere cannot end the walk.
    while node is not None and node is not stopElement:
        if isinstance(node, text_type):
            pieces.append(node)
        node = node.next_element
    return "".join(pieces)
# Start from an empty accumulator and collect text beginning at the first
# <big> tag.
targetString = loopUntilA('', firstBigTag)
# Call-form print with a single argument is valid on both Python 2 and 3.
print(targetString)
Another approach, using sibling traversal and itertools:
from BeautifulSoup import BeautifulSoup as bs
from itertools import takewhile, chain
def get_text(html, from_tag, until_tag):
    """Yield the text running from each ``from_tag`` element toward ``until_tag``.

    For every element named ``from_tag`` in the parsed ``html``, the text of
    that element is joined with the text of its following siblings, stopping
    at the first sibling that compares equal to the next ``until_tag``
    element. Siblings that have no ``text`` attribute, or whose text is
    blank, are skipped. A start element with no qualifying sibling before
    the stop point produces no output.
    """
    document = bs(html)
    for start in document(from_tag):
        stop = start.findNext(until_tag)
        collected = []
        for sibling in start.nextSiblingGenerator():
            # Skip nodes with no usable text (e.g. bare whitespace).
            if not getattr(sibling, 'text', '').strip():
                continue
            if sibling != stop:
                collected.append(getattr(sibling, 'text', ''))
            else:
                break
        # Only emit when at least one sibling contributed text.
        if collected:
            yield ''.join([getattr(start, 'text', '')] + collected)
# Print each extracted string on its own line.
for text in get_text(html, 'big', 'a'):
    # Call-form print with a single argument is valid on both Python 2 and 3.
    print(text)