Iterate over R to check spelling of a word vector

I have a dataset filled with inadequately spaced sentences. I am trying to find a way to remove some spaces.

I start with a sentence that I convert to a frame of data words:

> word5 <- "hotter the doghou se would be bec ause the co lor was diffe rent"
> abc1 <- data.frame(filler1 = 1,words1=factor(unlist(strsplit(word5, split=" "))))
> abc1
   filler1 words1
1        1 hotter
2        1    the
3        1 doghou
4        1     se
5        1  would
6        1     be
7        1    bec
8        1   ause
9        1    the
10       1     co
11       1    lor
12       1    was
13       1  diffe
14       1   rent

Next, I use the following code to check and spell check and combine words that are a combination of words before or after them:

abc2 <- abc1
i <- 1
while(i < nrow(abc1)){
  print(abc2)
  if(nrow(aspell(abc1$words1[i])) == 0){
    print(paste(i,"Words OK",sep=" | "));flush.console() 
    i <- i + 1
  }
 else{
  if(nrow(aspell(abc1$words1[i])) > 0 & i != 1){
    preWord1 <- abc1$words1[i-1]
    postWord1 <- abc1$words1[i+1]
    badWord1 <- abc1$words1[i]
    newWord1 <- factor(paste(preWord1,badWord1,sep=""))
    newWord2 <- factor(paste(badWord1,postWord1,sep=""))

    if(nrow(aspell(newWord1)) == 0 & nrow(aspell(newWord2)) != 0){
      abc2[i,"words1"] <-as.character(newWord1)
      abc2 <- abc2[-c(i+1),]
      print(paste(i,"word1",sep=" | "));flush.console()
      i <- i + 1
    }

    if(nrow(aspell(newWord1)) != 0 & nrow(aspell(newWord2)) == 0){
      abc2[i ,"words1"] <-as.character(newWord2)
      abc2 <- abc2[-c(i-1),]
      print(paste(i,"word2",sep=" | "));flush.console()
      i <- i + 1
    }

  }
}
}

After playing with this for a while, I come to the conclusion that I need some type of iterator, but I'm not sure how to implement it in R. Any suggestions?

+5
source share
2 answers

. , . . , , , .

. - , wordpair. :

> word5
[1] "hotter the doghou se would be bec ause the col or was diffe rent"

:

> abc1_pairs
    word1  word2
1  hotter    the
2     the doghou
3  doghou     se
4      se  would
5   would     be
6      be    bec
7     bec   ause
8    ause    the
9     the    col
10    col     or
11     or    was
12    was  diffe
13  diffe   rent

wordpairs , , , ( , ):

# Recursively delete wordpairs which lead to a correct word
merge_wordpairs = function(wordpairs) {
  require(plyr)
  merged_pairs = as.character(mlply(wordpairs, merge_word))
  correct_words_idxs = which(sapply(merged_pairs, word_correct))
  if(length(correct_words_idxs) == 0) {
    return(wordpairs)
  } else {
    message(sprintf("Number of words about to be merged in this pass: %s", length(correct_words_idxs)))
    for(idx in correct_words_idxs) {
      wordpairs = merge_specific_pair(wordpairs, idx, delete_pair = FALSE)
    }
    return(merge_wordpairs(wordpairs[-correct_words_idxs,])) # recursive call
  }
}

:

> word5 <- "hotter the doghou se would be bec ause the col or was diffe rent"
> abc1 = strsplit(word5, split = " ")[[1]]
> abc1_pairs = wordlist2wordpairs(abc1)
> abc1_pairs
    word1  word2
1  hotter    the
2     the doghou
3  doghou     se
4      se  would
5   would     be
6      be    bec
7     bec   ause
8    ause    the
9     the    col
10    col     or
11     or    was
12    was  diffe
13  diffe   rent
> abc1_merged_pairs = merge_wordpairs(abc1_pairs)
Number of words about to be merged in this pass: 4
> merged_sentence = paste(wordpairs2wordlist(abc1_merged_pairs), collapse = " ")
> c(word5, merged_sentence)
[1] "hotter the doghou se would be bec ause the col or was diffe rent"
[2] "hotter the doghouse would be because the color was different"    

:

# A bunch of functions
# Data transformation
wordlist2wordpairs = function(word_list) {
  require(plyr)
  wordpairs = ldply(seq_len(length(word_list) - 1), 
                    function(idx) 
                      return(c(word_list[idx], 
                               word_list[idx+1])))
  names(wordpairs) = c("word1", "word2")
  return(wordpairs)
}
wordpairs2wordlist = function(wordpairs) {
  return(c(wordpairs[[1]], wordpairs[[2]][nrow(wordpairs)]))
}

# Some checking functions
# Is the word correct?
word_correct = function(word) return(nrow(aspell(factor(word))) == 0)
# Merge two words
merge_word = function(word1, word2) return(paste(word1, word2, sep = ""))

# Merge a specific pair, option to postpone deletion of pair
merge_specific_pair = function(wordpairs, idx, delete_pair = TRUE) {
  # merge pair into word
  merged_word = do.call("merge_word", wordpairs[idx,])
  # assign the pair to the idx above
  if(!(idx == 1)) wordpairs[idx - 1, "word2"] = merged_word
  if(!(idx == nrow(wordpairs))) wordpairs[idx + 1, "word1"] = merged_word
  # assign the pair to the index below (if not last one)
  if(delete_pair) wordpairs = wordpairs[-idx,]
  return(wordpairs)
}
+10

, . . , , . , . , , paste(word_before, word, word_after). . , .

# Wrap the spell checking in a function, makes your code much more readable
word_correct = function(word) return(nrow(aspell(factor(word))) == 0)
# Merge two words
merge_word = function(word1, word2) return(paste(word1, word2, sep = ""))
# Merge two words and replace in list
merge_words_in_list = function(word_list, idx1, idx2) {
  word_list[idx1] = merge_word(word_list[idx1], word_list[idx2])
  return(word_list[-idx2])
}
# Function that recursively combines words 
combine_words = function(word_list) {
  message("Current sentence: ", paste(word_list, collapse = " "))
  words_ok = sapply(word_list, word_correct)
  if(all(words_ok)) {
    return(word_list) 
  } else {
    first_wrong_word = which(!words_ok)[1]
    combination_before = merge_word(word_list[first_wrong_word], 
                                    word_list[first_wrong_word-1])
    if(word_correct(combination_before)) {
      word_list = merge_words_in_list(word_list, first_wrong_word-1, 
                                      first_wrong_word)
    }
    combination_after = merge_word(word_list[first_wrong_word], 
                                   word_list[first_wrong_word+1])
    if(word_correct(combination_after)) {
      word_list = merge_words_in_list(word_list, first_wrong_word, 
                                      first_wrong_word+1)
    }
    return(combine_words(word_list))  # Recursive call
  }
}

( ) :

word5 <- "hotter the doghou se would be bec ause the col or was diffe rent"
abc1 = strsplit(word5, split = " ")[[1]]
combine_words(abc1)
Current sentence: hotter the doghou se would be bec ause the col or was diffe rent
Current sentence: hotter the doghouse would be bec ause the col or was diffe rent
Current sentence: hotter the doghouse would be because the col or was diffe rent
Current sentence: hotter the doghouse would be because the col or was different

:

  • - , combination_before combination_after , . , .
  • , , , ?
  • , . "col" "or" aspell, . : , , .

, , , .

+3

All Articles