English string dictionary collocation

I am trying to solve the problem of determining the best match of English words from a dictionary file to a given string.

For example ("strings", which are a list of dictionary words):

string testStr = "cakeday";

for (int x= 0; x<= testStr.Length; x++)
{
  string test = testStr.Substring(x);

   if (test.Length > 0)
   {
      string test2 = testStr.Remove(counter);
      int count = (from w in lines where w.Equals(test) || w.Equals(test2) select w).Count();
      Console.WriteLine("Test: {0} / {1} : {2}", test, test2, count);
    }
}

Gives output:

Test: cakeday /   : 0
Test: akeday / c  : 1
Test: keday / ca  : 0
Test: eday / cak  : 0
Test: day / cake  : 2
Test: ay / caked  : 1
Test: y / cakeda  : 1

Obviously, a day / cake is best for a string, however, if I have to enter the third word in a string, for example, "cakedaynow", it does not work so well.

I know that the example is primitive, all the more so as a proof of concept and wondered if anyone has any experience analyzing this type of string?

Thank!

+5
source share
2 answers

, , . .

, Levenehtein Edit Distance #, :

using System;

namespace StringMatching
{
    /// <summary>
    /// A class to extend the string type with a method to get Levenshtein Edit Distance.
    /// </summary>
    public static class LevenshteinDistanceStringExtension
    {
        /// <summary>
        /// Get the Levenshtein Edit Distance.
        /// </summary>
        /// <param name="strA">The current string.</param>
        /// <param name="strB">The string to determine the distance from.</param>
        /// <returns>The Levenshtein Edit Distance.</returns>
        public static int GetLevenshteinDistance(this string strA, string strB)
        {
            if (string.IsNullOrEmpty(strA) && string.IsNullOrEmpty(strB))
                return 0;

            if (string.IsNullOrEmpty(strA))
                return strB.Length;

            if (string.IsNullOrEmpty(strB))
                return strA.Length;

            int[,] deltas; // matrix
            int lengthA;
            int lengthB;
            int indexA;
            int indexB;
            char charA;
            char charB;
            int cost; // cost

            // Step 1
            lengthA = strA.Length;
            lengthB = strB.Length;

            deltas = new int[lengthA + 1, lengthB + 1];

            // Step 2
            for (indexA = 0; indexA <= lengthA; indexA++)
            {
                deltas[indexA, 0] = indexA;
            }

            for (indexB = 0; indexB <= lengthB; indexB++)
            {
                deltas[0, indexB] = indexB;
            }

            // Step 3
            for (indexA = 1; indexA <= lengthA; indexA++)
            {
                charA = strA[indexA - 1];

                // Step 4
                for (indexB = 1; indexB <= lengthB; indexB++)
                {
                    charB = strB[indexB - 1];

                    // Step 5
                    if (charA == charB)
                    {
                        cost = 0;
                    }
                    else
                    {
                        cost = 1;
                    }

                    // Step 6
                    deltas[indexA, indexB] = Math.Min(deltas[indexA - 1, indexB] + 1, Math.Min(deltas[indexA, indexB - 1] + 1, deltas[indexA - 1, indexB - 1] + cost));
                }
            }

            // Step 7
            return deltas[lengthA, lengthB];
        }
    }
}
+2

:

, , . :.

var list = new List<string>{"the", "me", "cat", "at", "theme"};
const string testStr = "themecat";
var words = new List<string>();
var len = testStr.Length;
for (int x = 0; x < len; x++)
{
    for(int i = (len - 1); i > x; i--)
    {
        string test = testStr.Substring(x, i - x + 1);
        if (list.Contains(test) && !words.Contains(test))
        {
            words.Add(test);
        }
    }
}

words.ForEach(n=> Console.WriteLine("{0}, ",n));//spit out current values

:

, , ,

Live Scenario 1: , , , , , . , , , , , .

:

var list = new List<string>{"the", "me", "cat", "at", "theme", "crying", "them"};
const string testStr = "themecatcryingthem";
var words = new Dictionary<int, string>();
var len = testStr.Length;
for (int x = 0; x < len; x++)
{
    int n = len > 28 ? 28 : len;//assuming 28 is the maximum length of an english word
    for(int i = (n - 1); i > x; i--)
    {
        string test = testStr.Substring(x, i - x + 1);
        if (list.Contains(test))
        {
            if (!words.ContainsValue(test))
            {
                bool found = false;//to check if there a shorter item starting from same index
                var key = testStr.IndexOf(test, x, len - x);
                foreach (var w in words)
                {
                    if (w.Value.Contains(test) && w.Key != key && key == (w.Key + w.Value.Length - test.Length))
                    {
                        found = true;
                    }
                }
                if (!found && !words.ContainsKey(key)) words.Add(key, test);
            }
        }
    }
}

words.Values.ToList().ForEach(n=> Console.WriteLine("{0}, ",n));//spit out current values

:

, , ,

+1

All Articles