I created a solution that reads a large csv file currently in size of 20-30 mb, I tried to delete duplicate rows based on certain column values that the user selects at runtime using the usual technique of finding duplicate rows but it is so slow that it seems that the program does not work at all.
What other method can be used to remove duplicate entries from a csv file
Here's the code, definitely I'm doing something wrong
DataTable dtCSV = ReadCsv (file, columns);
// columns is a list of string List column
DataTable dt = RemoveDuplicateRecords (dtCSV, columns);
private DataTable RemoveDuplicateRecords (DataTable dtCSV, List <string> columns)
{
DataView dv = dtCSV.DefaultView;
string RowFilter = string.Empty;
if (dt == null)
dt = dv.ToTable (). Clone ();
DataRow row = dtCSV.Rows [0];
foreach (DataRow row in dtCSV.Rows)
{
try
{
RowFilter = string.Empty;
foreach (string column in columns)
{
string col = column;
RowFilter + = "[" + col + "]" + "= '" + row [col] .ToString (). Replace ("'", "''") + "'and";
}
RowFilter = RowFilter.Substring (0, RowFilter.Length - 4);
dv.RowFilter = RowFilter;
DataRow dr = dt.NewRow ();
bool result = RowExists (dt, RowFilter);
if (! result)
{
dr.ItemArray = dv.ToTable (). Rows [0] .ItemArray;
dt.Rows.Add (dr);
}
}
catch (Exception ex)
{
}
}
return dt;
}
source
share