Python thread extraction

The standard library of many programming languages ​​includes a "scanner API" to extract strings, numbers, or other objects from text input streams. (For example, Java includes a class Scanner, C ++ includes istream, and C includes scanf).

What is equivalent to this in Python?

Python has a stream interface, that is, classes that inherit from io.IOBase. However, the Python stream interface TextIOBaseprovides linear input capabilities. After reading the documentation and searching on Google , t find something in the standard Python modules that will allow me, for example, to extract an integer from a text stream or to extract the next word with delimiters as a string. Are there any standard tools for this?

+5
source share
2 answers

fscanf Java Scanner. , , .

, , , , . python, pyparsing. scanf, 2008 .

, . , , , . , - :

import re


FORMATS_TYPES = {
    'd': int,
    'f': float,
    's': str,
}


FORMATS_REGEXES = {    
    'd': re.compile(r'(?:\s|\b)*([+-]?\d+)(?:\s|\b)*'),
    'f': re.compile(r'(?:\s|\b)*([+-]?\d+\.?\d*)(?:\s|\b)*'),
    's': re.compile(r'\b(\w+)\b'),
}


FORMAT_FIELD_REGEX = re.compile(r'%(s|d|f)')


def scan_input(format_string, stream, max_size=float('+inf'), chunk_size=1024):
    """Scan an input stream and retrieve formatted input."""

    chunk = ''
    format_fields = format_string.split()[::-1]
    while format_fields:
        fields = FORMAT_FIELD_REGEX.findall(format_fields.pop())
        if not chunk:
            chunk = _get_chunk(stream, chunk_size)

        for field in fields:
            field_regex = FORMATS_REGEXES[field]
            match = field_regex.search(chunk)
            length_before = len(chunk)
            while match is None or match.end() >= len(chunk):
                chunk += _get_chunk(stream, chunk_size)
                if not chunk or length_before == len(chunk):
                    if match is None:
                        raise ValueError('Missing fields.')
                    break
            text = match.group(1)
            yield FORMATS_TYPES[field](text)
            chunk = chunk[match.end():]



def _get_chunk(stream, chunk_size):
    try:
        return stream.read(chunk_size)
    except EOFError:
        return ''

:

>>> s = StringIO('1234 Hello World -13.48 -678 12.45')
>>> for data in scan_input('%d %s %s %f %d %f', s): print repr(data)
...                                                                                            
1234                                                                                           
'Hello'
'World'
-13.48
-678
12.45

, , , .

+3

( ). (. re).

:

# matching first integer (space delimited)
re.match(r'\b(\d+)\b',string)

# matching first space delimited word
re.match(r'\b(\w+)\b',string)

# matching a word followed by an integer (space delimited)
re.match(r'\b(\w+)\s+(\d+)\b',string)

, C-, . - , .

+1

All Articles