I found out myself: Julia can mutate a single temp buffer across all threads, and implementing Sedgewicks nocopy sort (with ping-pong merge) is faster, particularly when passing down preallocated buffer memory to the single threaded sorting Routine once N <= BIG_THRESHOLD. The following are measurements run with JULIA_NUM_THREADS=4 on a Intel(R) Core(TM) i5-4300M CPU @ 2.60GHz, 2594 MHz, 2 physical cores, 4 logical cores, replication code follows. Note that not only runtime of nocopy mergesort ist better, it also causes less garbage collection time. Conclusion: it would make sense for Julia's sorting interface to allow passing in preallocated buffer memory and to use nocopy mergesort behind the curtain. Of course true parallel sorting also requires a parallel merge.
Best
julia> # 4 threads, mergesort, dynamic allocation
julia> @time for i=1:100 c = copy(a); psort!(c); end;
166.270792 seconds (756.58 k allocations: 89.470 GiB, 15.34% gc time)
julia> # 4 threads, mergesort, preallocation per thread, re-allocation in single threaded subroutine
julia> @time for i=1:100 c = copy(a); tsort!(c); end;
143.973485 seconds (658.00 k allocations: 36.972 GiB, 5.89% gc time)
julia> # 4 threads, nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
julia> @time for i=1:100 c = copy(a); ksort!(c); end;
129.715467 seconds (654.51 k allocations: 37.312 GiB, 6.43% gc time)
julia> # 4 threads, nocopy mergesort, one preallocation reused in single threaded subroutine
julia> @time for i=1:100 c = copy(a); jsort!(c); end;
119.139551 seconds (449.91 k allocations: 29.846 GiB, 5.14% gc time)
import Base.Threads.@spawn
const SMALL_THRESHOLD = 2^6
const BIG_THRESHOLD = 2^16
# insertionsort
function isort!(v, lo::Int=1, hi::Int=length(v))
@inbounds for i = lo+1:hi
j = i
x = v[i]
while j > lo
if x < v[j-1]
v[j] = v[j-1]
j -= 1
continue
end
break
end
v[j] = x
end
return v
end
# single threaded merge (from t to v)
function nmerge!(v, lo::Int, mid::Int, hi::Int, t)
i, k, j = lo, lo, mid+1
@inbounds while i <= mid && j <= hi
if t[j] < t[i]
v[k] = t[j]
j += 1
else
v[k] = t[i]
i += 1
end
k += 1
end
@inbounds while i <= mid
v[k] = t[i]
k += 1
i += 1
end
@inbounds while j <= hi
v[k] = t[j]
k += 1
j += 1
end
return v
end
# single threaded nocopy mergesort, one preallocation
function nsort!(v, lo::Int, hi::Int, t)
@inbounds if hi-lo <= SMALL_THRESHOLD
return isort!(v, lo, hi)
end
mid = (lo+hi)>>>1
nsort!(t, lo, mid, v)
nsort!(t, mid+1, hi, v)
nmerge!(v, lo, mid, hi, t)
return v
end
function nsort!(v, lo::Int=1, hi::Int=length(v))
t = copy(v)
nsort!(v,lo,hi,t)
return v
end
# multithreaded mergesort, dynamic allocation
function psort!(v, lo::Int=1, hi::Int=length(v))
@inbounds if hi-lo <= BIG_THRESHOLD
sort!(view(v, lo:hi), alg = MergeSort)
return v
end
mid = (lo+hi)>>>1 # find the midpoint
half = @spawn psort!(v, lo, mid) # task to sort the lower half; will run
psort!(v, mid+1, hi) # in parallel with the current call sorting
# the upper half
wait(half) # wait for the lower half to finish
temp = v[lo:mid] # workspace for merging
i, k, j = 1, lo, mid+1 # merge the two sorted sub-arrays
@inbounds while k < j <= hi
if v[j] < temp[i]
v[k] = v[j]
j += 1
else
v[k] = temp[i]
i += 1
end
k += 1
end
@inbounds while k < j
v[k] = temp[i]
k += 1
i += 1
end
return v
end
# mergesort, preallocation per thread, re-allocation in single threaded subroutine
function tsort!(v, lo::Int=1, hi::Int=length(v), temps=[similar(v, 0) for i = 1:Threads.nthreads()])
if hi - lo < BIG_THRESHOLD # below some cutoff, run in serial
sort!(view(v, lo:hi), alg = MergeSort)
return v
end
mid = (lo+hi)>>>1 # find the midpoint
half = @spawn tsort!(v, lo, mid, temps) # task to sort the lower half; will run
tsort!(v, mid+1, hi, temps) # in parallel with the current call sorting
# the upper half
wait(half) # wait for the lower half to finish
temp = temps[Threads.threadid()] # workspace for merging
length(temp) < mid-lo+1 && resize!(temp, mid-lo+1)
copyto!(temp, 1, v, lo, mid-lo+1)
i, k, j = 1, lo, mid+1 # merge the two sorted sub-arrays
@inbounds while k < j <= hi
if v[j] < temp[i]
v[k] = v[j]
j += 1
else
v[k] = temp[i]
i += 1
end
k += 1
end
@inbounds while k < j
v[k] = temp[i]
k += 1
i += 1
end
return v
end
# multithreaded nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
function ksort!(v, lo::Int, hi::Int, t)
if hi - lo < BIG_THRESHOLD # below some cutoff, run in serial
sort!(view(v, lo:hi), alg = MergeSort)
return v
end
mid = (lo+hi)>>>1 # find the midpoint
half = @spawn ksort!(t, lo, mid, v) # task to sort the lower half; will run
ksort!(t, mid+1, hi, v) # in parallel with the current call sorting
wait(half) # wait for the lower half to finish
nmerge!(v, lo, mid, hi, t)
return v
end
function ksort!(v, lo::Int=1, hi::Int=length(v))
t = copy(v)
ksort!(v,lo,hi,t)
return v
end
# multithreaded nocopy mergesort, one preallocation reused in single threaded subroutine
function jsort!(v, lo::Int, hi::Int, t)
if hi - lo < BIG_THRESHOLD # below some cutoff, run in serial
return nsort!(v, lo, hi, t)
end
mid = (lo+hi)>>>1 # find the midpoint
half = @spawn jsort!(t, lo, mid, v) # task to sort the lower half; will run
jsort!(t, mid+1, hi, v) # in parallel with the current call sorting
wait(half) # wait for the lower half to finish
nmerge!(v, lo, mid, hi, t)
return v
end
function jsort!(v, lo::Int=1, hi::Int=length(v))
t = copy(v)
jsort!(v,lo,hi,t)
return v
end
a = rand(2000);
b = copy(a); @time sort!(b, alg = MergeSort); # reference
c = copy(a); @time isort!(c); # insertionsort
all(b==c) # regression test
a = rand(20000000);
b = copy(a); @time sort!(b, alg = MergeSort); # single-threaded system mergesort used by psort and tsort
c = copy(a); @time nsort!(c); # single threaded nocopy mergesort, one preallocation
all(b==c) # regression test
c = copy(a); @time psort!(c); # 4 threads, mergesort, dynamic allocation
all(b==c) # regression test
c = copy(a); @time tsort!(c); # 4 threads, mergesort, preallocation per thread, re-allocation in single threaded subroutine
all(b==c) # regression test
c = copy(a); @time ksort!(c); # 4 threads, nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
all(b==c) # regression test
c = copy(a); @time jsort!(c); # 4 threads, nocopy mergesort, one preallocation reused in single threaded subroutine
all(b==c) # regression test
# 4 threads, mergesort, dynamic allocation
@time for i=1:100 c = copy(a); psort!(c); end;
# 4 threads, mergesort, preallocation per thread, re-allocation in single threaded subroutine
@time for i=1:100 c = copy(a); tsort!(c); end;
# 4 threads, nocopy mergesort, one preallocation, re-allocation in single threaded subroutine
@time for i=1:100 c = copy(a); ksort!(c); end;
# 4 threads, nocopy mergesort, one preallocation reused in single threaded subroutine
@time for i=1:100 c = copy(a); jsort!(c); end;
I believe the `psort!` code that was shown in the blog post was a 'toy example', meant to demonstrate how to use different parallel techniques, one of them being the use of thread-local state.
There's probably going to be a lot of development of parallelized code going forward. Maybe you could help? :)
Providing Julia Code for in-memory sorting, i.e. a binary mergesort with parallel branches and parallel merge plus a specialized sort for instable inplace sorting (better Quicksort, parallel branches only) is not a huge step from the code above: I could contribute that. If however, the expectation would be that the sort is more general, i.e. k-ary and performs for data actually being on ssd or even on disk (JuliaDB?) thats a complete different story: one would need to experiment with much more complicated algorithms such as cache-oblivious funnel-sort, parallel superscalar samplesort, real disk sorts ...
I don't have time for that. How would I avoid duplicating the efforts of others?
I found out myself: Julia can mutate a single temp buffer across all threads, and implementing Sedgewicks nocopy sort (with ping-pong merge) is faster, particularly when passing down preallocated buffer memory to the single threaded sorting Routine once N <= BIG_THRESHOLD. The following are measurements run with JULIA_NUM_THREADS=4 on a Intel(R) Core(TM) i5-4300M CPU @ 2.60GHz, 2594 MHz, 2 physical cores, 4 logical cores, replication code follows. Note that not only runtime of nocopy mergesort ist better, it also causes less garbage collection time. Conclusion: it would make sense for Julia's sorting interface to allow passing in preallocated buffer memory and to use nocopy mergesort behind the curtain. Of course true parallel sorting also requires a parallel merge.
Best