MySQL SELECT DISTINCT with variation tolerance

In my database I have many records that are very similar but not identical; for example, only two characters may differ:

Row1: "The weather is nice, see http://xyz56.com "

Row2: "The weather is nice, see http://xyz31.com "

I would like to get rid of these partial duplicates and get just one result for these two rows. It does not matter which one is kept; I would suggest using the first one that appears.

Is there any function in MySQL I could use to do this efficiently? My first thought was to pull out more data and do a row-by-row comparison: if the number of matching characters exceeds a certain threshold, then ignore the row. The disadvantage is that I never know how many records I have to fetch from the database, and it is also inefficient, since I have to compare each row with all the other rows (O(n²)).

Update: To be more specific about the use case: the position of the deviation is not always at the end of the line, and no more than two characters change. The length varies from line to line.

+3
source share
4 answers
+4

SOUNDEX.

SOUNDEX(str)
Returns a soundex string from str. Two strings that sound almost the same should have identical soundex strings. A standard soundex string is four characters long, but the SOUNDEX() function returns an arbitrarily long string. You can use SUBSTRING() on the result to get a standard soundex string. All non-alphabetic characters in str are ignored. All international alphabetic characters outside the A-Z range are treated as vowels.

mysql> SELECT SOUNDEX('Hello');
+---------------------------------------------------------+
| SOUNDEX('Hello')                                        |
+---------------------------------------------------------+
| H400                                                    |
+---------------------------------------------------------+
1 row in set (0.00 sec)

Source: http://www.tutorialspoint.com/mysql/mysql-string-functions.htm#operator_sounds-like

For example, in Oracle PL/SQL, SOUNDEX returns the same SOUNDEX code for both of your sample rows:

select soundex ('The weather is nice, see http://xyz56.com') from dual;

SOUNDEX('THEWEATHERISNICE,SEEHTTP://XYZ56.COM')
-----------------------------------------------
T362                                           
1 row selected.

select soundex ('The weather is nice, see http://xyz31.com') from dual;

SOUNDEX('THEWEATHERISNICE,SEEHTTP://XYZ31.COM')
-----------------------------------------------
T362                                           
1 row selected.
+2

-- One representative text per group of rows that share their first 10 characters.
-- MIN() makes the chosen representative deterministic and keeps the statement valid
-- when ONLY_FULL_GROUP_BY is enabled (SELECT * with GROUP BY on an expression is
-- rejected in that mode and returns an arbitrary row per group otherwise).
-- Note: this only merges rows whose differences lie after the 10th character.
SELECT MIN(mytext) AS mytext
FROM test
GROUP BY SUBSTR(mytext, 1, 10);
+2

MySQL:

: mysql/ ?

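- One option is to compute the Levenshtein (edit) distance between the strings. Here is a MySQL implementation from http://www.codejanitor.com/wp/.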

-- Levenshtein (edit) distance between s1 and s2, counted in characters:
-- the minimum number of single-character insertions, deletions and
-- substitutions needed to turn s1 into s2.
-- Returns NULL when either argument is NULL (previously such calls fell
-- through to the final RETURN and reported a distance of 0).
-- Each row of the distance matrix is packed one byte per cell into a
-- VARBINARY(256), which is why inputs are limited to 255 characters.
CREATE FUNCTION levenshtein( s1 VARCHAR(255), s2 VARCHAR(255) )
  RETURNS INT
  DETERMINISTIC
  BEGIN
    DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
    DECLARE s1_char CHAR;
    -- max strlen=255
    DECLARE cv0, cv1 VARBINARY(256);
    -- NULL in, NULL out: a distance to an unknown string is itself unknown.
    IF s1 IS NULL OR s2 IS NULL THEN
      RETURN NULL;
    END IF;
    SET s1_len = CHAR_LENGTH(s1), s2_len = CHAR_LENGTH(s2), cv1 = 0x00, j = 1, i = 1, c = 0;
    IF s1 = s2 THEN
      RETURN 0;
    ELSEIF s1_len = 0 THEN
      -- Turning '' into s2 takes one insertion per character of s2.
      RETURN s2_len;
    ELSEIF s2_len = 0 THEN
      -- Turning s1 into '' takes one deletion per character of s1.
      RETURN s1_len;
    ELSE
      -- Seed the previous row with bytes 0, 1, ..., s2_len.
      WHILE j <= s2_len DO
        SET cv1 = CONCAT(cv1, UNHEX(HEX(j))), j = j + 1;
      END WHILE;
      WHILE i <= s1_len DO
        -- Start the current row: its first cell is i; c tracks the cell to the left.
        SET s1_char = SUBSTRING(s1, i, 1), c = i, cv0 = UNHEX(HEX(i)), j = 1;
        WHILE j <= s2_len DO
          -- Candidate 1: cell to the left + 1.
          SET c = c + 1;
          IF s1_char = SUBSTRING(s2, j, 1) THEN
            SET cost = 0;
          ELSE
            SET cost = 1;
          END IF;
          -- Candidate 2: diagonal cell of the previous row + substitution cost.
          SET c_temp = CONV(HEX(SUBSTRING(cv1, j, 1)), 16, 10) + cost;
          IF c > c_temp THEN
            SET c = c_temp;
          END IF;
          -- Candidate 3: cell directly above in the previous row + 1.
          SET c_temp = CONV(HEX(SUBSTRING(cv1, j+1, 1)), 16, 10) + 1;
          IF c > c_temp THEN
            SET c = c_temp;
          END IF;
          -- Append the minimum to the current row and advance.
          SET cv0 = CONCAT(cv0, UNHEX(HEX(c))), j = j + 1;
        END WHILE;
        -- The current row becomes the previous row for the next character of s1.
        SET cv1 = cv0, i = i + 1;
      END WHILE;
    END IF;
    -- c holds the last cell of the last row, i.e. the full distance.
    RETURN c;
  END;

A helper function that expresses the distance as a similarity percentage:

-- Similarity of s1 and s2 as a whole-number percentage:
-- 100 * (1 - levenshtein(s1, s2) / length of the longer string), rounded.
-- Returns NULL when either argument is NULL, and 100 when both are empty
-- (identical strings; avoids dividing by a zero length).
-- Lengths are measured with CHAR_LENGTH so they use the same unit
-- (characters, not bytes) as the distance returned by LEVENSHTEIN().
CREATE FUNCTION levenshtein_ratio( s1 VARCHAR(255), s2 VARCHAR(255) )
  RETURNS INT
  DETERMINISTIC
  BEGIN
    DECLARE max_len INT;
    IF s1 IS NULL OR s2 IS NULL THEN
      RETURN NULL;
    END IF;
    SET max_len = GREATEST(CHAR_LENGTH(s1), CHAR_LENGTH(s2));
    IF max_len = 0 THEN
      RETURN 100;
    END IF;
    RETURN ROUND((1 - LEVENSHTEIN(s1, s2) / max_len) * 100);
  END;

Alternative: a Levenshtein distance function in Oracle PL/SQL

Source: http://www.merriampark.com/ldplsql.htm

CREATE OR REPLACE FUNCTION ld -- Levenshtein distance
  (p_source_string   IN VARCHAR2,
   p_target_string   IN VARCHAR2)
  RETURN                NUMBER
  DETERMINISTIC
AS
  -- One row of the distance matrix, indexed 0 .. length of the target string.
  TYPE distance_row IS  TABLE OF NUMBER INDEX BY BINARY_INTEGER;
  -- NVL maps the NULL length of an empty/NULL string to 0.
  src_len               NUMBER := NVL (LENGTH (p_source_string), 0);
  tgt_len               NUMBER := NVL (LENGTH (p_target_string), 0);
  prev_row              distance_row;
  this_row              distance_row;
  subst_cost            NUMBER := 0;
BEGIN
  -- If one side is empty, the distance is the length of the other side.
  IF src_len = 0 THEN
    RETURN tgt_len;
  END IF;
  IF tgt_len = 0 THEN
    RETURN src_len;
  END IF;

  -- Seed the previous row with 0, 1, ..., tgt_len.
  FOR col IN 0 .. tgt_len LOOP
    prev_row(col) := col;
  END LOOP;

  FOR pos IN 1 .. src_len LOOP
    this_row(0) := pos;
    FOR col IN 1 .. tgt_len LOOP
      -- Substitution is free when the two characters match.
      subst_cost := CASE
                      WHEN SUBSTR (p_source_string, pos, 1) =
                           SUBSTR (p_target_string, col, 1)
                      THEN 0
                      ELSE 1
                    END;
      -- Cheapest of: left cell + 1, cell above + 1, diagonal cell + substitution cost.
      this_row(col) := LEAST (this_row(col - 1) + 1,
                              prev_row(col) + 1,
                              prev_row(col - 1) + subst_cost);
    END LOOP;
    -- The finished row becomes the previous row for the next source character.
    prev_row := this_row;
  END LOOP;

  RETURN this_row(tgt_len);
END ld;

Given an EMPLOYEES table with a FIRST_NAME VARCHAR2 column, this returns every employee for whom some row has a first name at Levenshtein Distance = 1:

-- Employees for whom some row in employees has a first_name
-- at Levenshtein distance exactly 1.
SELECT *
  FROM employees emp
 WHERE EXISTS (
         SELECT 1
           FROM employees peer
          WHERE ld (peer.first_name, emp.first_name) = 1
       );

Pairs of first_name values whose Levenshtein Distance = 1:

-- Every ordered pair of first names whose Levenshtein distance is exactly 1.
SELECT lhs.first_name, rhs.first_name
  FROM employees lhs
 INNER JOIN employees rhs
    ON ld (rhs.first_name, lhs.first_name) = 1;

Distinct pairs whose Levenshtein distance is 1 or 2, followed by sample output:

-- Distinct ordered pairs of first names that differ by one or two edits.
-- ld returns whole numbers, so BETWEEN 1 AND 2 selects the same pairs
-- as "<= 2 AND > 0".
SELECT DISTINCT lhs.first_name, rhs.first_name
  FROM employees lhs
 INNER JOIN employees rhs
    ON ld (rhs.first_name, lhs.first_name) BETWEEN 1 AND 2;

FIRST_NAME;FIRST_NAME_1
Jean;John
Nancy;Vance
Alana;Allan
Alana;Clara
Ellen;Eleni
John;Jean
Daniel;Danielle
Danielle;Daniel
Shelley;Shelli
Sundita;Nandita
Lisa;Luis
Stephen;Steven
Nanette;Janette
Diana;Alana
TJ;Ki
Luis;Lisa
Sarath;Sarah
Louise;Luis
Ki;TJ
Allan;Ellen
Luis;Louise
Den;Lex
Clara;Alana
Matthew;Mattea
Shelli;Shelley
Sarah;Sarath
Girard;Gerald
Vance;Nancy
Mattea;Martha
Allan;Alana
Nandita;Sundita
Ellen;Allan
Jean;Den
Eleni;Ellen
Gerald;Girard
Lex;Den
Janette;Nanette
Steven;Stephen
Mattea;Matthew
Den;Jean
Martha;Mattea
Alana;Diana
0
source

All Articles