Better way to clean a string?

OK, consider the following test:

public class CleanString
{
    //by MSDN http://msdn.microsoft.com/en-us/library/844skk0h(v=vs.71).aspx
    public static string UseRegex(string strIn)
    {
        // Replace invalid characters with empty strings.
        return Regex.Replace(strIn, @"[^\w\.@-]", "");
    }

    // by Paolo Tedesco
    public static String UseStringBuilder(string strIn)
    {
        const string removeChars = " ?&^$#@!()+-,:;<>’\'-_*";
        // specify capacity of StringBuilder to avoid resizing
        StringBuilder sb = new StringBuilder(strIn.Length);
        foreach (char x in strIn.Where(c => !removeChars.Contains(c)))
        {
            sb.Append(x);
        }
        return sb.ToString();
    }

    // by Paolo Tedesco, but using a HashSet
    public static String UseStringBuilderWithHashSet(string strIn)
    {
        var hashSet = new HashSet<char>(" ?&^$#@!()+-,:;<>’\'-_*");
        // specify capacity of StringBuilder to avoid resizing
        StringBuilder sb = new StringBuilder(strIn.Length);
        foreach (char x in strIn.Where(c => !hashSet.Contains(c)))
        {
            sb.Append(x);
        }
        return sb.ToString();
    }

    // by SteveDog
    public static string UseStringBuilderWithHashSet2(string dirtyString)
    {
        HashSet<char> removeChars = new HashSet<char>(" ?&^$#@!()+-,:;<>’\'-_*");
        StringBuilder result = new StringBuilder(dirtyString.Length);
        foreach (char c in dirtyString)
            if (removeChars.Contains(c))
                result.Append(c);
        return result.ToString();
    }

    // original by patel.milanb
    public static string UseReplace(string dirtyString)
    {
        string removeChars = " ?&^$#@!()+-,:;<>’\'-_*";
        string result = dirtyString;

        foreach (char c in removeChars)
        {
            result = result.Replace(c.ToString(), string.Empty);
        }

        return result;
    }

    // by L.B
    public static string UseWhere(string dirtyString)
    {
        return new String(dirtyString.Where(Char.IsLetterOrDigit).ToArray());
    }
}

static class Program
{
    /// <summary>
    /// The main entry point for the application.
    /// </summary>
    [STAThread]
    static void Main()
    {
        var dirtyString = "sdfdf.dsf8908()=(=(sadfJJLef@ssyd€sdöf////fj()=/§(§&/(\"&sdfdf.dsf8908()=(=(sadfJJLef@ssyd€sdöf////fj()=/§(§&/(\"&sdfdf.dsf8908()=(=(sadfJJLef@ssyd€sdöf";
        var sw = new Stopwatch();

        var iterations = 50000;
        
        sw.Start();
        for (var i = 0; i < iterations; i++)
            CleanString.<SomeMethod>(dirtyString);
        sw.Stop();
        Debug.WriteLine("CleanString.<SomeMethod>: " + sw.ElapsedMilliseconds.ToString());
        sw.Reset();

        ....
        <repeat>
        ....       
    }
}

Output

CleanString.UseReplace: 791
CleanString.UseStringBuilder: 2805
CleanString.UseStringBuilderWithHashSet: 521
CleanString.UseStringBuilderWithHashSet2: 331
CleanString.UseRegex: 1700
CleanString.UseWhere: 233

Conclusion

It probably does not matter which method you use.

The difference in time between the fastest (UseWhere: 233ms) and the slowest (UseStringBuilder: 2805ms) method is 2572ms when called 50000 (!) times in a row. If you don’t run the method that often, the difference does not really matter.

But if performance is critical, use the UseWhere method (written by L.B). Note, however, that its behavior is slightly different.

Leave a Comment

Hata!: SQLSTATE[HY000] [1045] Access denied for user 'divattrend_liink'@'localhost' (using password: YES)