The fastest (portable) way to split an array in C #

I am writing a fully-managed Mercurial library (which will be used in a fully-managed Mercurial Server for Windows in the near future) and one of the most serious performance problems that I encounter, oddly enough, is splitting the array in parts.

The idea is this: there is an array of bytes ranging in size from a few hundred bytes to a megabyte, and all I need to do is split it into parts, limited to \ncharacters in my particular case .

Now that dotTrace shows me that my “optimized” version Split (the code is correct, here is the naive version I started with) takes 11 seconds for 2300 calls (there is an obvious performance success introduced by dotTrace itself, but to the point).

Here are the numbers:

  • unsafeversion: 11 297ms for 2 312calls
  • managed ("naive") version: 20 001ms for 2 312calls

So here's what: there will be the fastest (preferably portable, i.e. supporting both x86 and x64) way to split an array into C #.

+5
source share
3 answers

, , . , . .

, , , .

public static unsafe Segment[] Split2(byte[] _src, byte value)
{
    var _ln = _src.Length;

    if (_ln == 0) return new Segment[] { };

    fixed (byte* src = _src)
    {
        var segments = new LinkedList<Segment>(); // Segment[c];

        byte* last = src;
        byte* end = src + _ln - 1;
        byte lastValue = *end;
        *end = value; // value-termination

        var cur = src;
        while (true)
        {
            if (*cur == value)
            {
                int begin = (int) (last - src);
                int length = (int) (cur - last + 1);
                segments.AddLast(new Segment(_src, begin, length));

                last = cur + 1;

                if (cur == end)
                {
                    if (lastValue != value)
                    {
                        *end = lastValue;
                    }
                    break;
                }
            }
            cur++;
        }

        return segments.ToArray();
    }
}

: , .

+4

Split 32- , uint. , : 32-, 64-.

, .

. , .

:

ToString: "(" + Offset.ToString() + "," + Length.ToString() + ")";

GetHashCode: ( * b = [])


, . : , , .

class ArraySplitter
{
    private byte[] m_data;
    private int    m_count;
    private int[]  m_stops;

    private void AddRange(int start, int stop)
    {
        // Skip empty range
        if (start > stop)
        {
            return;
        }

        // Grow array if needed
        if ((m_stops == null) || (m_stops.Length < (m_count + 2)))
        {
            int[] old = m_stops;

            m_stops = new int[m_count * 3 / 2 + 4];

            if (old != null)
            {
                old.CopyTo(m_stops, 0);
            }
        }

        m_stops[m_count++] = start;
        m_stops[m_count++] = stop;
    }

    public int Split(byte[] data, byte sep)
    {
        m_data  = data;
        m_count = 0;      // reuse m_stops

        int last = 0;

        for (int i = 0; i < data.Length; i ++)
        {
            if (data[i] == sep)
            {
                AddRange(last, i - 1);
                last = i + 1;
            }
        }

        AddRange(last, data.Length - 1);

        return m_count / 2;
    }

    public ArraySegment<byte> this[int index]
    {
        get
        {
            index *= 2;
            int start = m_stops[index];

            return new ArraySegment<byte>(m_data, start, m_stops[index + 1] - start + 1);
        }
    }
}

:

    static void Main(string[] args)
    {
        int count = 1000 * 1000;

        byte[] data = new byte[count];

        for (int i = 0; i < count; i++)
        {
            data[i] = (byte) i;
        }

        Stopwatch watch = new Stopwatch();

        for (int r = 0; r < 10; r++)
        {
            watch.Reset();
            watch.Start();

            int len = 0;

            foreach (var seg in data.MySplit(13))
            {
                len += seg.Count;
            }

            watch.Stop();

            Console.WriteLine("MySplit      : {0} {1,8:N3} ms", len, watch.Elapsed.TotalMilliseconds);

            watch.Reset();
            watch.Start();

            ArraySplitter splitter = new ArraySplitter();

            int parts = splitter.Split(data, 13);

            len = 0;

            for (int i = 0; i < parts; i++)
            {
                len += splitter[i].Count;
            }

            watch.Stop();
            Console.WriteLine("ArraySplitter: {0} {1,8:N3} ms", len, watch.Elapsed.TotalMilliseconds);
        }
    }

:

MySplit      : 996093    9.514 ms
ArraySplitter: 996093    4.754 ms
MySplit      : 996093    7.760 ms
ArraySplitter: 996093    2.710 ms
MySplit      : 996093    8.391 ms
ArraySplitter: 996093    3.510 ms
MySplit      : 996093    9.677 ms
ArraySplitter: 996093    3.468 ms
MySplit      : 996093    9.685 ms
ArraySplitter: 996093    3.370 ms
MySplit      : 996093    9.700 ms
ArraySplitter: 996093    3.425 ms
MySplit      : 996093    9.669 ms
ArraySplitter: 996093    3.519 ms
MySplit      : 996093    9.844 ms
ArraySplitter: 996093    3.416 ms
MySplit      : 996093    9.721 ms
ArraySplitter: 996093    3.685 ms
MySplit      : 996093    9.703 ms
ArraySplitter: 996093    3.470 ms
+3

, , , , -, , . HgSharp bitbucket.org, HgLab. , , . , , . , , .

In addition to rewriting the core logic, I decided to use the built ArraySegment<>-in built -in infrastructure instead of your custom implementation. The only significant difference is that it ArraySegment<>provides a property Countinstead of a property Length. The code below does not require an unsafe keyword because I do not use pointers, but it seems to be a slight performance improvement if it is modified to do this.


    public static ArraySegment<byte>[] SplitEx(this byte[] source, byte value) {
        var _previousIndex = -1;
        var _segments = new List<ArraySegment<byte>>();
        var _length = source.Length;

        if (_length > 0) {
            int _index;

            for (_index = 0; _index < _length; _index++) {
                var _value = source[_index];
                if (_value == value) {
                    _segments.Add(new ArraySegment<byte>(source, _previousIndex + 1, _index - _previousIndex));
                    _previousIndex = _index;
                }
            }

            if (--_index != _previousIndex) {
                _segments.Add(new ArraySegment<byte>(source, _previousIndex + 1, _index - _previousIndex));
            }
        }

        return _segments.ToArray();
    }
+2
source

All Articles