Beating Decades of Optimized C with 27 Lines of OCaml:
(* Running tallies for one input file; every field is updated in place while
   the file is scanned chunk by chunk. *)
type t = {
  mutable words : int;     (* number of completed words *)
  mutable chars : int;     (* number of bytes read *)
  mutable lines : int;     (* number of '\n' bytes seen *)
  mutable in_word : bool;  (* true while the last byte was not a separator *)
  mutable finished : bool; (* set once a read returns no more data *)
};;
(* Entry point of the word counter.

   With no file arguments a usage line is written to stderr. Otherwise each
   file named on the command line is read in 64 KB chunks and one summary line
   with its line, word and character (byte) counts is printed to stdout.
   No grand total across files is printed. *)
let () =
  match Core.Sys.argv with
  | [| prog_name |] ->
    Core.Printf.eprintf "Usage: %s file1 file2 ...\n" prog_name
  | _ ->
    (* Everything after the program name is a file to count. *)
    let args =
      Core.Array.slice Core.Sys.argv 1 (Core.Array.length Core.Sys.argv)
    in
    let buf_size = 65536 in (* 64 KB -> Caml IO buffer size *)
    (* One read buffer shared by all files. *)
    let buf = Core.Bytes.create buf_size in
    Core.Array.iter args ~f:(fun file ->
      Core.In_channel.with_file file ~f:(fun in_ch ->
        let c =
          { words = 0; chars = 0; lines = 0; in_word = false; finished = false }
        in
        (* Close the word currently being scanned, if any. Called on every
           separator byte and once more at end of file, so that a final word
           not followed by whitespace is still counted. *)
        let end_word () =
          if c.in_word then begin
            c.words <- c.words + 1;
            c.in_word <- false
          end
        in
        while not c.finished do
          let len = Core.In_channel.input in_ch ~buf ~pos:0 ~len:buf_size in
          if len > 0 then begin
            (* Every byte counts as one character, so add the chunk at once. *)
            c.chars <- c.chars + len;
            for i = 0 to len - 1 do
              match Core.Caml.Bytes.get buf i with
              | '\n' -> (end_word (); c.lines <- c.lines + 1)
              (* Vertical tab (0x0b) and form feed (0x0c) are separators too,
                 completing the ASCII whitespace set of C's isspace. *)
              | ' ' | '\t' | '\r' | '\x0b' | '\x0c' -> end_word ()
              | _ -> c.in_word <- true
            done
          end else
            (* A read of zero bytes means end of file. *)
            c.finished <- true
        done;
        end_word ();
        Core.Printf.printf "%s -> lines: %d, words: %d, characters: %d\n"
          file c.lines c.words c.chars))
;;
Testing:
$ ls -lh test_file.txt
-rw-r--r-- 1 user group 508M Oct 16 12:01 test_file.txt
$ time wc test_file.txt
4863460 54621760 532480000 test_file.txt
real 0m3.368s
user 0m3.137s
sys 0m0.195s
#Ocaml version:
$ time ./wcl.native test_file.txt
test_file.txt -> lines: 4863460, words: 54621760, characters: 532480000
real 0m2.480s
user 0m2.345s
sys 0m0.123s
While I wouldn't be surprised that OCaml is fast, I'd like to be sure your disk cache was warm in both cases. I'd test it myself, but I don't have OCaml set up on my work computer.
I set it up on my computer and used it to count the lines of 250 files. It is indeed faster than my system's (Ubuntu 18.04) wc, but it doesn't compute the total. I don't think this should change the timings much, though.
A best of five run of the system's wc takes 0.042s, vs 0.028s for the OCaml version.
I was going to say that it is unreadable, but after spending a bit of time trying to understand it (as someone who knows no OCaml), it actually is not. The only things I'm unsure about are the ~ symbol and <-. What do they do?
edit: <- is just mutating assignment. I thought that ~ introduced named arguments, but that doesn't seem right.
Not really, the initialization of the JVM is fast(er), but initializing the Clojure runtime took a lot of time. At least in 1.7, not sure if it got better in the meantime.
I could start and run Java programs way faster than just starting the Clojure REPL.
I wonder if most of these older standards could be replaced by new standards that use SQLite as the Application File Format [0], making life easier for everyone.
open Core
(* Entry point of the CSV column replacer.

   Usage: <prog> <input.csv> <column-name> <replacement-string> <output.csv>

   Copies the input to the output, keeping the header line unchanged and
   replacing the field under <column-name> in every following line with
   <replacement-string>. Fields are found by a plain split on ','; quoted
   fields that contain commas are not handled. Problems (wrong argument count,
   empty input, unknown column) are reported on stderr. The output file is
   only created once the column has been found in the header. *)
let () =
  if Array.length Sys.argv <> 5 then
    Printf.eprintf "Invalid args\nUsage: %s <input.csv> <column-name> <replacement-string> <output.csv>" Sys.argv.(0)
  else begin
    let input_file = Sys.argv.(1) in
    let column_name = Sys.argv.(2) in
    let replacement = Sys.argv.(3) in
    let output_file = Sys.argv.(4) in
    In_channel.with_file input_file ~f:(fun in_ch ->
      match In_channel.input_line in_ch with
      | None -> Printf.eprintf "Cannot read first line"
      | Some first_line ->
        let columns = String.split first_line ~on:',' in
        (* String.equal rather than (=): with Core opened, (=) is specialised
           to int in current releases and would not type-check on strings. *)
        (match
           List.findi columns ~f:(fun _ col -> String.equal col column_name)
         with
         | None -> Printf.eprintf "Cannot find column: %s" column_name
         | Some (index, _) ->
           Out_channel.with_file output_file ~f:(fun out_ch ->
             (* The header is copied through untouched. *)
             Out_channel.output_string out_ch first_line;
             Out_channel.output_char out_ch '\n';
             (* in_ch is already positioned after the header, so this visits
                only the data lines. *)
             In_channel.iter_lines in_ch ~f:(fun line ->
               let out_columns =
                 String.split line ~on:','
                 |> List.mapi ~f:(fun i col ->
                   if i = index then replacement else col)
               in
               Out_channel.output_string out_ch
                 (String.concat ~sep:"," out_columns);
               Out_channel.output_char out_ch '\n'))))
  end
https://pastebin.com/xt3Wddtu
Output:
https://pastebin.com/KwSTiN8L