Maybe I misunderstand, but in R, allocating and copying huge return vectors for small changes to input vectors ruins performance. My understanding was that one of Julia's advantages over R is the possibility of mutating input vectors in place. Referring to the psort! example for Julia's multithreading, I wonder about the statement "We simply need to allocate one initially-empty array per thread", which to me looks like allocating huge vectors for small changes down the call recursion (and wastes memory proportional to the number of threads). If Julia can mutate psort!'s first argument, the global vector 'v', from multiple threads, why can't Julia also mutate a single preallocated 'temp' argument vector? Bonus: psort! could then merge from one argument to the other and save the copy-back at each recursion level (along the lines of Sedgewick's nocopy mergesort)?
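For readers wondering whether that is even legal: tasks may freely mutate disjoint ranges of one shared array. A minimal sketch (the helper name fill_halves! is hypothetical, not from the psort! example; run with JULIA_NUM_THREADS > 1 for actual parallelism):

```julia
import Base.Threads.@spawn

# Hypothetical demo: two tasks write to disjoint halves of one shared
# buffer; no per-task copy and no locking is needed.
function fill_halves!(t::Vector{Int})
    mid = length(t) >>> 1
    lower = @spawn for i in 1:mid
        t[i] = i                 # lower half, written by the spawned task
    end
    for i in mid+1:length(t)
        t[i] = -i                # upper half, written by the current task
    end
    wait(lower)
    return t
end
```

For example, fill_halves!(zeros(Int, 4)) returns [1, 2, -3, -4]: both tasks mutated the same buffer, each in its own range.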
I found out myself: Julia can mutate a single temp buffer across all threads, and implementing Sedgewick's nocopy mergesort (with ping-pong merging) is faster, particularly when the preallocated buffer is passed down to the single-threaded sorting routine once N <= BIG_THRESHOLD. The following measurements were run with JULIA_NUM_THREADS=4 on an Intel(R) Core(TM) i5-4300M CPU @ 2.60GHz (2594 MHz, 2 physical cores, 4 logical cores); replication code follows. Note that the nocopy mergesort not only has better runtime, it also spends less time in garbage collection. Conclusion: it would make sense for Julia's sorting interface to allow passing in preallocated buffer memory and to use a nocopy mergesort behind the curtain. Of course, truly parallel sorting also requires a parallel merge.
Best
julia> # 4 threads, mergesort, dynamic allocation
julia> @time for i=1:100 c = copy(a); psort!(c); end;
166.270792 seconds (756.58 k allocations: 89.470 GiB, 15.34% gc time)
julia> # 4 threads, mergesort, preallocation per thread, re-allocation in single threaded subroutine
julia> @time for i=1:100 c = copy(a); tsort!(c); end;
143.973485 seconds (658.00 k allocations: 36.972 GiB, 5.89% gc time)
julia> # 4 threads, nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
julia> @time for i=1:100 c = copy(a); ksort!(c); end;
129.715467 seconds (654.51 k allocations: 37.312 GiB, 6.43% gc time)
julia> # 4 threads, nocopy mergesort, one preallocation reused in single threaded subroutine
julia> @time for i=1:100 c = copy(a); jsort!(c); end;
119.139551 seconds (449.91 k allocations: 29.846 GiB, 5.14% gc time)
import Base.Threads.@spawn
const SMALL_THRESHOLD = 2^6
const BIG_THRESHOLD = 2^16
# insertionsort
function isort!(v, lo::Int=1, hi::Int=length(v))
    @inbounds for i = lo+1:hi
        j = i
        x = v[i]
        while j > lo
            if x < v[j-1]
                v[j] = v[j-1]
                j -= 1
                continue
            end
            break
        end
        v[j] = x
    end
    return v
end
# single threaded merge (from t to v)
function nmerge!(v, lo::Int, mid::Int, hi::Int, t)
    i, k, j = lo, lo, mid+1
    @inbounds while i <= mid && j <= hi
        if t[j] < t[i]
            v[k] = t[j]
            j += 1
        else
            v[k] = t[i]
            i += 1
        end
        k += 1
    end
    @inbounds while i <= mid
        v[k] = t[i]
        k += 1
        i += 1
    end
    @inbounds while j <= hi
        v[k] = t[j]
        k += 1
        j += 1
    end
    return v
end
# single threaded nocopy mergesort, one preallocation
function nsort!(v, lo::Int, hi::Int, t)
    if hi-lo <= SMALL_THRESHOLD
        return isort!(v, lo, hi)
    end
    mid = (lo+hi)>>>1
    nsort!(t, lo, mid, v)
    nsort!(t, mid+1, hi, v)
    nmerge!(v, lo, mid, hi, t)
    return v
end
function nsort!(v, lo::Int=1, hi::Int=length(v))
    t = copy(v)
    nsort!(v, lo, hi, t)
    return v
end
# multithreaded mergesort, dynamic allocation
function psort!(v, lo::Int=1, hi::Int=length(v))
    if hi - lo <= BIG_THRESHOLD              # below some cutoff, run in serial
        sort!(view(v, lo:hi), alg = MergeSort)
        return v
    end
    mid = (lo+hi)>>>1                        # find the midpoint
    half = @spawn psort!(v, lo, mid)         # task to sort the lower half; runs in
    psort!(v, mid+1, hi)                     # parallel with this call sorting the upper half
    wait(half)                               # wait for the lower half to finish
    temp = v[lo:mid]                         # workspace for merging
    i, k, j = 1, lo, mid+1                   # merge the two sorted sub-arrays
    @inbounds while k < j <= hi
        if v[j] < temp[i]
            v[k] = v[j]
            j += 1
        else
            v[k] = temp[i]
            i += 1
        end
        k += 1
    end
    @inbounds while k < j
        v[k] = temp[i]
        k += 1
        i += 1
    end
    return v
end
# mergesort, preallocation per thread, re-allocation in single threaded subroutine
function tsort!(v, lo::Int=1, hi::Int=length(v), temps=[similar(v, 0) for i = 1:Threads.nthreads()])
    if hi - lo < BIG_THRESHOLD               # below some cutoff, run in serial
        sort!(view(v, lo:hi), alg = MergeSort)
        return v
    end
    mid = (lo+hi)>>>1                        # find the midpoint
    half = @spawn tsort!(v, lo, mid, temps)  # task to sort the lower half; runs in
    tsort!(v, mid+1, hi, temps)              # parallel with this call sorting the upper half
    wait(half)                               # wait for the lower half to finish
    temp = temps[Threads.threadid()]         # workspace for merging
    length(temp) < mid-lo+1 && resize!(temp, mid-lo+1)
    copyto!(temp, 1, v, lo, mid-lo+1)
    i, k, j = 1, lo, mid+1                   # merge the two sorted sub-arrays
    @inbounds while k < j <= hi
        if v[j] < temp[i]
            v[k] = v[j]
            j += 1
        else
            v[k] = temp[i]
            i += 1
        end
        k += 1
    end
    @inbounds while k < j
        v[k] = temp[i]
        k += 1
        i += 1
    end
    return v
end
# multithreaded nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
function ksort!(v, lo::Int, hi::Int, t)
    if hi - lo < BIG_THRESHOLD               # below some cutoff, run in serial
        sort!(view(v, lo:hi), alg = MergeSort)
        return v
    end
    mid = (lo+hi)>>>1                        # find the midpoint
    half = @spawn ksort!(t, lo, mid, v)      # task to sort the lower half; runs in
    ksort!(t, mid+1, hi, v)                  # parallel with this call sorting the upper half
    wait(half)                               # wait for the lower half to finish
    nmerge!(v, lo, mid, hi, t)
    return v
end
function ksort!(v, lo::Int=1, hi::Int=length(v))
    t = copy(v)
    ksort!(v, lo, hi, t)
    return v
end
# multithreaded nocopy mergesort, one preallocation reused in single threaded subroutine
function jsort!(v, lo::Int, hi::Int, t)
    if hi - lo < BIG_THRESHOLD               # below some cutoff, run in serial
        return nsort!(v, lo, hi, t)
    end
    mid = (lo+hi)>>>1                        # find the midpoint
    half = @spawn jsort!(t, lo, mid, v)      # task to sort the lower half; runs in
    jsort!(t, mid+1, hi, v)                  # parallel with this call sorting the upper half
    wait(half)                               # wait for the lower half to finish
    nmerge!(v, lo, mid, hi, t)
    return v
end
function jsort!(v, lo::Int=1, hi::Int=length(v))
    t = copy(v)
    jsort!(v, lo, hi, t)
    return v
end
a = rand(2000);
b = copy(a); @time sort!(b, alg = MergeSort); # reference
c = copy(a); @time isort!(c); # insertionsort
all(b==c) # regression test
a = rand(20000000);
b = copy(a); @time sort!(b, alg = MergeSort); # single-threaded system mergesort used by psort and tsort
c = copy(a); @time nsort!(c); # single threaded nocopy mergesort, one preallocation
all(b==c) # regression test
c = copy(a); @time psort!(c); # 4 threads, mergesort, dynamic allocation
all(b==c) # regression test
c = copy(a); @time tsort!(c); # 4 threads, mergesort, preallocation per thread, re-allocation in single threaded subroutine
all(b==c) # regression test
c = copy(a); @time ksort!(c); # 4 threads, nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
all(b==c) # regression test
c = copy(a); @time jsort!(c); # 4 threads, nocopy mergesort, one preallocation reused in single threaded subroutine
all(b==c) # regression test
# 4 threads, mergesort, dynamic allocation
@time for i=1:100 c = copy(a); psort!(c); end;
# 4 threads, mergesort, preallocation per thread, re-allocation in single threaded subroutine
@time for i=1:100 c = copy(a); tsort!(c); end;
# 4 threads, nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
@time for i=1:100 c = copy(a); ksort!(c); end;
# 4 threads, nocopy mergesort, one preallocation reused in single threaded subroutine
@time for i=1:100 c = copy(a); jsort!(c); end;
I believe the `psort!` code that was shown in the blog post was a 'toy example', meant to demonstrate how to use different parallel techniques, one of them being the use of thread-local state.
There's probably going to be a lot of development of parallelized code going forward. Maybe you could help? :)
Providing Julia code for in-memory sorting, i.e. a binary mergesort with parallel branches and a parallel merge, plus a specialized sort for unstable in-place sorting (a better quicksort, parallel branches only), is not a huge step from the code above: I could contribute that. If, however, the expectation is that the sort be more general, i.e. k-ary and performant for data that actually lives on SSD or even on disk (JuliaDB?), that's a completely different story: one would need to experiment with much more complicated algorithms such as cache-oblivious funnelsort, parallel superscalar samplesort, real disk sorts ...
I don't have time for that. How would I avoid duplicating the efforts of others?
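On the parallel-merge point: a two-way parallel merge can be sketched by splitting the left run at its midpoint and binary-searching the matching split point in the right run, so the lower and upper halves of the output land in disjoint ranges and can be merged by separate tasks. The following is only a sketch of that idea (the helper names rmerge! and pmerge! are mine, not from the code above); a tuned version would recurse on both half-merges and fall back to a serial merge below a cutoff:

```julia
import Base.Threads.@spawn

# Serial merge of the two sorted runs t[l1:h1] and t[l2:h2] into v,
# writing from position k onward (mirrors nmerge! above, but the two
# input runs need not be adjacent).
function rmerge!(v, k::Int, t, l1::Int, h1::Int, l2::Int, h2::Int)
    i, j = l1, l2
    @inbounds while i <= h1 && j <= h2
        if t[j] < t[i]
            v[k] = t[j]; j += 1
        else
            v[k] = t[i]; i += 1
        end
        k += 1
    end
    @inbounds while i <= h1
        v[k] = t[i]; k += 1; i += 1
    end
    @inbounds while j <= h2
        v[k] = t[j]; k += 1; j += 1
    end
    return v
end

# Parallel two-way merge of t[lo:mid] and t[mid+1:hi] into v[lo:hi]
# (assumes lo <= mid <= hi). The left run is split at i; the matching
# split j of the right run comes from a binary search, so everything
# below the splits merges into the lower part of v and everything above
# into the upper part — two disjoint output ranges, merged concurrently.
function pmerge!(v, lo::Int, mid::Int, hi::Int, t)
    i = (lo + mid + 1) >>> 1                              # split left run
    j = mid + searchsortedfirst(view(t, mid+1:hi), t[i])  # split right run
    half = @spawn rmerge!(v, lo, t, lo, i-1, mid+1, j-1)  # lower halves
    rmerge!(v, i + j - mid - 1, t, i, mid, j, hi)         # upper halves
    wait(half)
    return v
end
```

For example, with t = [1,3,5,7,2,4,6,8], pmerge!(similar(t), 1, 4, 8, t) yields [1,2,3,4,5,6,7,8].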