Parsing words into (prefix, root, suffix) in Python

Question

Parsing words into (prefix, root, suffix) in Python

I am trying to create a simple parser for some text data. (The text is in a language that NLTK does not have parsers for.)

Basically, I have a limited number of prefixes, each of which is either one or two letters long; a word can have more than one prefix. I also have a limited number of suffixes of one or two letters. Everything in between should be the "root" of the word. Many words will have more than one possible parse, so I want to pass in a word and get back a list of possible analyses, each in the form of a (prefix, root, suffix) tuple.

I can't figure out how to structure the code. I've pasted an example of what I tried below (using some dummy English data to make it easier to follow), but it is clearly not right. For one thing, it's really ugly and redundant, so I'm sure there is a better way to do this. For another, it does not work with words that have more than one prefix or suffix, or that have both prefix(es) and suffix(es).

Any thoughts?

prefixes = ['de', 'con']
suffixes = ['er', 's']


def parser(word):
    """Return the candidate ``(prefix, root, suffix)`` splits found for ``word``.

    Only fixed-width slices are examined: the first two and the first three
    characters at the front, and ``word[-2:-1]`` / ``word[-3:-1]`` at the
    back (both of those slices stop short of the final character).  Each
    rule below contributes at most one tuple and the result keeps rule
    order.  An empty list means that no rule fired.
    """
    head2, head3 = word[0:2], word[0:3]
    tail1, tail2 = word[-2:-1], word[-3:-1]
    starts2 = head2 in prefixes
    starts3 = head3 in prefixes

    # Every entry pairs "does this rule fire?" with the tuple it contributes.
    rules = (
        # Front-only splits.
        (starts2, (head2, word[2:], '')),
        (starts3, (head3, word[3:], '')),
        # Back-only splits; the tail slices are looked up in ``prefixes``.
        (tail1 in prefixes, ('', word[:-2], tail1)),
        (tail2 in prefixes, ('', word[:-3], tail2)),
        # Front-and-back splits: kept only when the measured middle slice
        # is longer than two characters.  Note that the slice measured and
        # the slice reported as the root are not always the same one.
        (starts2 and tail1 in suffixes and len(word[2:-2]) > 2,
         (head2, word[2:-2], tail1)),
        (starts2 and tail2 in suffixes and len(word[2:-3]) > 2,
         (head2, word[2:-2], tail2)),
        (starts3 and tail1 in suffixes and len(word[3:-2]) > 2,
         (head2, word[2:-2], tail1)),
        (starts3 and tail2 in suffixes and len(word[3:-3]) > 2,
         (head3, word[3:-2], tail2)),
    )
    return [split for fires, split in rules if fires]



>>> wordlist = ['construct','destructer','constructs','deconstructs']
>>> for w in wordlist:
...   parses = parser(w)
...   print w
...   for p in parses:
...     print p
... 
construct
('con', 'struct', '')
destructer
('de', 'structer', '')
constructs
('con', 'structs', '')
deconstructs
('de', 'constructs', '')

+3

python parsing nlp

larapsodia Apr 14 '12 at 19:03

source share

3 answers

:

prefixes = ['de', 'con']
suffixes = ['er', 's']


def parse(word):
    """Greedily strip known prefixes, then known suffixes, from ``word``.

    Returns a ``(prefix, root, suffix)`` tuple of strings, where ``prefix``
    and ``suffix`` are the stripped pieces joined back together in the
    order they appeared in the word and ``root`` is whatever remains.
    Exactly one analysis is produced per word.
    """
    # Front of the word: keep sweeping the prefix list until a full sweep
    # removes nothing.
    leading = []
    stripped = True
    while stripped:
        stripped = False
        for candidate in prefixes:
            if not word.startswith(candidate):
                continue
            leading.append(candidate)
            word = word[len(candidate):]
            stripped = True

    # Back of the word: same idea, but each new piece sits in front of the
    # pieces already removed, so it goes to the head of the list.
    trailing = []
    stripped = True
    while stripped:
        stripped = False
        for candidate in suffixes:
            if not word.endswith(candidate):
                continue
            trailing.insert(0, candidate)
            word = word[:-len(candidate)]
            stripped = True

    return (''.join(leading), word, ''.join(trailing))

# Demo calls: each line prints the (prefix, root, suffix) tuple for one word.
# Written as single-argument print(...) calls so the lines run unchanged on
# both Python 2 and Python 3 and produce the same text on each.
print(parse('construct'))
print(parse('destructer'))
print(parse('deconstructs'))
print(parse('deconstructers'))
print(parse('deconstructser'))
print(parse('condestructser'))

Output:

>>> 
('con', 'struct', '')
('de', 'struct', 'er')
('decon', 'struct', 's')
('decon', 'struct', 'ers')
('decon', 'struct', 'ser')
('conde', 'struct', 'ser')

, , . , , , , , .

, , .

+2

Israel Unterman Apr 14 '12 at 19:25

CodeChords , ( ), , , .

class Parser:
    """Split a word into known prefixes, a stem, and known suffixes.

    Affixes are peeled off greedily, one at a time, and an affix is only
    removed when the stem left behind is at least ``MINUMUM_STEM_LENGTH``
    characters long.  All methods are classmethods, so no instance is
    required (although calling them through an instance works too).
    """

    PREFIXES = ['de', 'con']
    SUFFIXES = ['er', 's']
    # Shortest stem that stripping an affix is allowed to leave behind.
    # (The attribute name is kept exactly as spelled so that existing
    # references and subclass overrides keep working.)
    MINUMUM_STEM_LENGTH = 3

    @classmethod
    def _pick_affix(cls, affixes, stem, matches):
        """Return the first usable affix from ``affixes``, or ``None``.

        ``matches`` is ``stem.startswith`` or ``stem.endswith``.  An affix is
        usable when it matches and removing it leaves at least
        ``MINUMUM_STEM_LENGTH`` characters.  Empty strings are skipped
        because stripping them could never make progress.
        """
        for affix in affixes:
            if not affix:
                continue
            if matches(affix) and len(stem) - len(affix) >= cls.MINUMUM_STEM_LENGTH:
                return affix
        return None

    @classmethod
    def prefixes(cls, word, internal=False):
        """Strip leading prefixes from ``word``.

        Returns ``(prefixes, stem)`` where ``prefixes`` is always a list.
        By default the list is in word order (outermost prefix first); with
        ``internal=True`` it is innermost-first, which is the order the
        earlier recursive implementation used for its intermediate results.
        """
        stem = word
        found = []
        while True:
            prefix = cls._pick_affix(cls.PREFIXES, stem, stem.startswith)
            if prefix is None:
                break
            found.append(prefix)
            stem = stem[len(prefix):]
        if internal:
            found.reverse()
        return found, stem

    @classmethod
    def suffixes(cls, word):
        """Strip trailing suffixes from ``word``.

        Returns ``(suffixes, stem)`` where ``suffixes`` is a list in word
        order (the suffix closest to the stem comes first).
        """
        stem = word
        found = []
        while True:
            suffix = cls._pick_affix(cls.SUFFIXES, stem, stem.endswith)
            if suffix is None:
                break
            found.append(suffix)
            stem = stem[:-len(suffix)]
        # Suffixes were collected from the end of the word inwards; flip
        # them so they read left to right.
        found.reverse()
        return found, stem

    @classmethod
    def parse(cls, word):
        """Return ``(prefixes, stem, suffixes)`` for ``word``.

        ``prefixes`` and ``suffixes`` are tuples of strings in word order.
        Prefixes are stripped first, then suffixes are stripped from what
        is left.
        """
        prefixes, word = cls.prefixes(word)
        suffixes, word = cls.suffixes(word)
        return (tuple(prefixes), word, tuple(suffixes))

# Sample inputs, from a bare affix and very short stems up to words that
# carry several prefixes and suffixes at once.
words = [
    'con',
    'deAAers',
    'deAAAers',
    'construct',
    'destructer',
    'constructs',
    'deconstructs',
    'deconstructers',
]

# parse() is a classmethod; it is called through an instance here.
parser = Parser()
for word in words:
    print(parser.parse(word))

Output:

((), 'con', ())
(('de',), 'AAer', ('s',))
(('de',), 'AAA', ('er', 's'))
(('con',), 'struct', ())
(('de',), 'struct', ('er',))
(('con',), 'struct', ('s',))
(('de', 'con'), 'struct', ('s',))
(('de', 'con'), 'struct', ('er', 's'))

This works by taking a word and using str.startswith() to find prefixes. It does this recursively until it gets down to a word without prefixes, and then returns the list of prefixes.

It then does a similar thing for suffixes, except that it uses str.endswith(), for obvious reasons.

+2

Gareth Latty Apr 14 '12 at 19:42

source share

PaulMcG · Accepted Answer · 2012-04-14T22:28:52+0000

Pyparsing wraps string indexing and token extraction in its own parsing structure and allows you to use simple arithmetic syntax to create parsing definitions:

from pyparsing import StringEnd, oneOf, FollowedBy, Optional, ZeroOrMore, SkipTo

wordlist = ['construct', 'destructer', 'constructs', 'deconstructs']

# Grammar pieces.  A suffix is only accepted when it is immediately
# followed by the end of the string.
endOfString = StringEnd()
prefix = oneOf("de con")
suffix = oneOf("er s") + FollowedBy(endOfString)

# word := any number of prefixes, then the root (everything up to a final
# suffix or the end of the string), then an optional suffix.  The quoted
# names become the result names read back below.
word = (ZeroOrMore(prefix)("prefixes") +
        SkipTo(suffix | endOfString)("root") +
        Optional(suffix)("suffix"))

# print(...) with a single argument behaves the same on Python 2 and 3;
# print('') stands in for the bare Python 2 ``print`` that emitted a blank line.
for wd in wordlist:
    print(wd)
    res = word.parseString(wd)
    print(res.dump())
    print(res.prefixes)
    print(res.root)
    print(res.suffix)
    print('')

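The results come back as a ParseResults object, which can be accessed like a list, by attribute name, or like a dict. Output: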

construct
['con', 'struct']
- prefixes: ['con']
- root: struct
['con']
struct


destructer
['de', 'struct', 'er']
- prefixes: ['de']
- root: struct
- suffix: ['er']
['de']
struct
['er']

constructs
['con', 'struct', 's']
- prefixes: ['con']
- root: struct
- suffix: ['s']
['con']
struct
['s']

deconstructs
['de', 'con', 'struct', 's']
- prefixes: ['de', 'con']
- root: struct
- suffix: ['s']
['de', 'con']
struct
['s']

Parsing words into (prefix, root, suffix) in Python

More articles: