Discussion: Speed wars 3: OCaml, F#, Clojure, Pascal
unknown
2009-02-18 16:04:36 UTC
Pentium 4, 3.2 GHz
Windows XP

2.475 sec. -- F#
3.276 sec. -- Clojure (java -server)
5.921 sec. -- OCaml (ocamlopt)
5.156 sec. -- FreePascal

I included Pascal in order to show how the functional languages
compare to an imperative one.

It's amazing that F# and Clojure beat the native machine code
generated by FreePascal.



F# -------------------------------------------------------------

let runs = 100
let max_iterations = 1000

let iterate ci cr =
    let rec loop zi zr i =
        if i <= max_iterations then
            let zr2 = zr * zr and
                zi2 = zi * zi in
            if zi2 + zr2 <= 4.0 then
                loop ( zr * zi * 2.0 + ci) (zr2 - zi2 + cr) (i + 1)
            else
                i
        else
            0
    in
    loop 0.0 0.0 1

let mandelbrot n =
    for y = -39 to 38 do
        if 1 = n then printf "\n";
        for x = -39 to 38 do
            let i =
                iterate (float x / 40.0) (float y / 40.0 - 0.5) in
            if 1 = n then
                printf "%s" ( if 0 = i then "*" else " " );
        done
    done;;

let timer = System.Diagnostics.Stopwatch.StartNew () in
for i = 1 to runs do
    mandelbrot i
done;
printf "\n%d\n" timer.ElapsedMilliseconds;



Clojure -----------------------------------------------------

;; malkia's version

(def runs 100)
(def max_iterations 1000)

(defn iter [ci cr]
  (let [max_iter (int max_iterations)
        ci (double ci)
        cr (double cr)]
    (loop [zi (double ci) zr (double cr) i (int 1)]
      (if (<= i max_iter)
        (let [zr2 (* zr zr) zi2 (* zi zi)]
          (if (<= (+ zr2 zi2) (double 4.0))
            (recur (+ (* (* zr zi) (double 2.0)) ci)
                   (+ (- zr2 zi2) cr)
                   (inc i))
            i))
        0))))

(defn mand [n]
  (doseq [y (range -39 39)]
    (when (= 1 n) (print "\n"))
    (doseq [x (range -39 39)]
      (let [i (iter (/ x 40.0) (- (/ y 40.0) 0.5))]
        (when (= 1 n) (print (if (= 0 i) "*" " ")))))))

(defn time-mand []
  (time
   (doseq [i (range 1 (inc runs))]
     (mand i))))

(time-mand)



OCaml ---------------------------------------------

let runs = 100
let max_iterations = 1000

let iterate ci cr =
  let bailout = 4.0 in
  let rec loop zi zr i =
    if i > max_iterations then
      0
    else
      let temp = zr *. zi and
          zr2 = zr *. zr and
          zi2 = zi *. zi in
      if zi2 +. zr2 > bailout then
        i
      else
        loop (temp +. temp +. ci) (zr2 -. zi2 +. cr) (i + 1)
  in
  loop 0.0 0.0 1

let mandelbrot n =
  for y = -39 to 38 do
    if 1 = n then print_endline "" else ();
    for x = -39 to 38 do
      let i = iterate
          (float x /. 40.0) (float y /. 40.0 -. 0.5) in
      if 1 = n then
        print_string ( if 0 = i then "*" else " " )
      else ();
    done
  done;;

let start_time = Unix.gettimeofday () in
for iter = 1 to runs do
  mandelbrot iter
done;
print_endline "";
print_float ( Unix.gettimeofday () -. start_time );
print_endline "";




FreePascal ----------------------------------------------------

uses sysutils; { for timing }

const
  runs = 100;
  max_iterations = 1000;

function iterate( ci, cr: real): longint;
var
  count : longint;
  zr, zi, zr2, zi2 : real;
begin
  count := 1;
  zr := 0.0; zi := 0.0; zr2 := 0.0; zi2 := 0.0;
  while (count <= max_iterations) and (zr2 + zi2 < 4.0) do
  begin
    zi := zr * zi * 2.0 + ci;
    zr := zr2 - zi2 + cr;
    inc( count );
    zr2 := zr * zr;
    zi2 := zi * zi
  end;
  if count > max_iterations then
    exit( 0 )
  else
    exit( count )
end;

procedure mandelbrot( n: longint );
var
  y, x, i : longint;
begin
  for y := -39 to 38 do
  begin
    if 1 = n then writeln;
    for x := -39 to 38 do
    begin
      i := iterate( (x / 40.0), ( y / 40.0 - 0.5) );
      if 1 = n then
        if 0 = i then write( '*' ) else write( ' ' );
    end
  end
end;

var
  when : tDateTime;
  iter : longint;
begin
  when := Time;
  for iter := 1 to runs do
    mandelbrot( iter );
  writeln;
  writeln( ((time - when) * secsPerDay):0:3, ' seconds' )
end.
Alia K
2009-02-19 10:25:25 UTC
I wonder how haskell would do in this (-:

AK
Michael Oswald
2009-02-19 12:25:04 UTC
Ok, here is my first attempt (and beware that I am a Haskell newbie and
possibly a more experienced programmer would write it differently):


Reference: I converted William's Pascal program to C++:

(gcc 3.3.3 with -O3):
./time Mandelbrot_cpp:
2.595u 0.004s 0:03.81 67.9% 0+0k 0+0io 0pf+0w

(ocamlopt 3.11.0):
./time Mandelbrot_ocaml:
12.404u 0.025s 0:17.69 70.2% 0+0k 0+0io 0pf+0w

(ghc 6.10.1 with -O2):
./time Mandelbrot_haskell:
0.321u 0.001s 0:00.74 43.2% 0+0k 0+0io 0pf+0w

BTW, I used the time command to get the various timings for the programs
because the different APIs return times with different meanings on my
machine.


Because Haskell is so much faster than C++, I am clearly running into a
laziness issue in that Haskell doesn't calculate the whole range.

If somebody knows how to correct the Haskell program to evaluate the
whole range, feel free to do so.
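
Perhaps forcing the result with seq in the iter helper would already do
it; something like this (untested, and maybe not idiomatic):

    iter y x = do
        let res = Main.iterate ((fromIntegral x) / 40.0)
                               ((fromIntegral y) / 40.0 - 0.5)
        -- force the iteration count even on runs where nothing is printed
        res `seq` if n == 1
            then if res == 0 then putChar '*' else putChar ' '
            else return ()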


Haskell:

module Main where

import System.Time

runs :: Int
runs = 100

max_iterations :: Int
max_iterations = 1000

iterate :: Double -> Double -> Int
iterate ci cr =
    let bailout = 4.0
        loop zi zr i =
            if i > max_iterations then
                0
            else
                let temp = zr * zi
                    zr2 = zr * zr
                    zi2 = zi * zi in
                if zi2 + zr2 > bailout then
                    i
                else
                    loop (temp + temp + ci) (zr2 - zi2 + cr) (i + 1)
    in loop 0.0 0.0 1

mandelbrot n = do
    let y = [-39..38]
        x = [-39..38]

        iter y x = do
            let res = Main.iterate ((fromIntegral x) / 40.0)
                                   ((fromIntegral y) / 40.0 - 0.5)
            if n == 1
                then if res == 0 then putChar '*' else putChar ' '
                else return ()

        inner y = do
            mapM_ (iter y) x
            if n == 1 then putChar '\n' else return ()
        outer = mapM_ (\i -> inner i) y
    outer

main = do
    let iter = [1..runs]

    startTime <- getClockTime

    mapM_ mandelbrot iter

    endTime <- getClockTime

    let diff = show (diffClockTimes endTime startTime)
    putStrLn $ "Time: " ++ diff




C++:

#include <iostream>
#include <time.h>


static const int runs = 100;
static const int max_iterations = 1000;

long iterate(double ci, double cr)
{
    long count = 1;
    double zr = 0.0;
    double zi = 0.0;
    double zr2 = 0.0;
    double zi2 = 0.0;

    while((count <= max_iterations) && (zr2 + zi2 < 4.0))
    {
        zi = zr * zi * 2.0 + ci;
        zr = zr2 - zi2 + cr;
        ++count;
        zr2 = zr * zr;
        zi2 = zi * zi;
    }
    if(count > max_iterations)
    {
        return 0;
    }
    else
    {
        return count;
    }
}

void mandelbrot(long n)
{
    long i;
    for(int y = -39; y <= 38; ++y)
    {
        if(n == 1)
        {
            std::cout << '\n';
        }
        for(int x = -39; x <= 38; ++x)
        {
            i = iterate(x / 40.0, y / 40.0 - 0.5);
            if(n == 1)
            {
                if(i == 0) std::cout << '*';
                else std::cout << ' ';
            }
        }
    }
}

int main()
{
    clock_t start, end;
    double cpu_time_used;

    start = clock();
    for(long iter = 1; iter <= runs; ++iter)
    {
        mandelbrot(iter);
    }
    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout.precision(6);
    std::cout << cpu_time_used << " seconds" << std::endl;

}



lg,
Michael
Jon Harrop
2009-02-19 17:28:08 UTC
Post by Michael Oswald
If somebody knows how to correct the Haskell program to evaluate the
whole range, feel free to do so.
Better to correct the benchmark itself, which is performing many
redundant computations. Get rid of that loop over 100 runs (set runs=1)
and increase max_iterations to 65536 instead.
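
In the Haskell version above, for example, that is just the two constants:

    runs :: Int
    runs = 1

    max_iterations :: Int
    max_iterations = 65536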

I get:

32-bit
GHC: 9.639s
OCaml: 5.516s

64-bit
GHC: 6.725s
OCaml: 1.318s
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
unknown
2009-02-19 18:23:56 UTC
Post by Jon Harrop
Post by Michael Oswald
If somebody knows how to correct the Haskell program to
evaluate the whole range, feel free to do so.
Better to correct the benchmark itself, which is performing
many redundant computations. Get rid of that loop over 100
runs (just set runs=1) and just increase the max iterations
to 65536 instead.
32-bit
GHC: 9.639s
OCaml: 5.516s
64-bit
GHC: 6.725s
OCaml: 1.318s
32-bit (3.2 GHz Pentium 4)
Clojure: 2.015 sec.


; using the version of the Clojure program without pmap
(def runs 1)
(def max_iterations 65536)
Jon Harrop
2009-02-19 18:43:31 UTC
Post by unknown
Post by Jon Harrop
Post by Michael Oswald
If somebody knows how to correct the Haskell program to
evaluate the whole range, feel free to do so.
Better to correct the benchmark itself, which is performing
many redundant computations. Get rid of that loop over 100
runs (just set runs=1) and just increase the max iterations
to 65536 instead.
32-bit
GHC: 9.639s
OCaml: 5.516s
64-bit
GHC: 6.725s
OCaml: 1.318s
32-bit (3.2 GHz Pentium 4)
Clojure: 2.015 sec.
Wow, that's embarrassingly slow. ;-)

Here is a parallel OCaml implementation that takes 0.34s on my 8 core:

let max_iterations = 65536

let iterate ci cr =
  let zr = ref 0.0 in
  let zi = ref 0.0 in
  let i = ref 1 in
  try
    while true do
      if !i > max_iterations then raise Exit;
      let temp = !zr *. !zi and zr2 = !zr *. !zr and zi2 = !zi *. !zi in
      if zr2 +. zi2 > 4.0 then raise Exit;
      zr := zr2 -. zi2 +. cr;
      zi := temp +. temp +. ci;
      incr i
    done;
    0
  with Exit ->
    if !i > max_iterations then 0 else !i

let row y =
  let s = String.make 78 ' ' in
  for j = 0 to 77 do
    let x = j - 39 in
    let i = iterate
        (float x /. 40.0) (float y /. 40.0 -. 0.5) in
    if i=0 then s.[j] <- '*'
  done;
  s

let invoke (f : 'a -> 'b) x : unit -> 'b =
  let input, output = Unix.pipe() in
  match Unix.fork() with
  | -1 -> (let v = f x in fun () -> v)
  | 0 ->
    Unix.close input;
    let output = Unix.out_channel_of_descr output in
    Marshal.to_channel output (try `Res(f x) with e -> `Exn e) [];
    close_out output;
    exit 0
  | pid ->
    Unix.close output;
    let input = Unix.in_channel_of_descr input in
    fun () ->
      let v = Marshal.from_channel input in
      ignore (Unix.waitpid [] pid);
      close_in input;
      match v with
      | `Res x -> x
      | `Exn e -> raise e

let () =
  Array.iter (fun f -> print_endline(f()))
    (Array.init 78 (fun i -> invoke row (i-39)))
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-19 19:53:22 UTC
That looks quite verbose. :)

This Haskell program also calculates the lines in parallel:

---------------------------
module Main(main) where

import Control.Parallel.Strategies

max_iterations :: Int
max_iterations = 65536

iterate :: Double -> Double -> Char
iterate ci cr = loop 0.0 0.0 1
  where
    loop zi zr i
      | i > max_iterations = '*'
      | zi2 + zr2 > 4.0    = ' '
      | otherwise          = loop zi' zr' (i + 1)
      where temp = zr * zi
            zr2 = zr * zr
            zi2 = zi * zi
            zi' = temp + temp + ci
            zr' = zr2 - zi2 + cr

mandelbrot :: String
mandelbrot = unlines image
  where image = parMap rnf line [-39..38] -- (*)
        line y = map (pixel y) [-39..38]
        pixel y x = Main.iterate (x / 40.0) (y/40.0-0.5)

main :: IO ()
main = putStrLn mandelbrot
---------------------------

Compile with options
-fviaa-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
and run with option
+RTS +Nx
where x is a number that is slightly higher than the number of cores.

(*) Function parMap evaluates the list elements in parallel, applying
strategy rnf to each element. Strategy rnf evaluates its argument
completely.
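
For reference, parMap is (roughly, in the old Strategies API) defined in
terms of parList:

    parMap :: Strategy b -> (a -> b) -> [a] -> [b]
    parMap strat f xs = map f xs `using` parList strat

so the list is still produced by an ordinary map, and the strategy then
sparks the evaluation of each element in parallel.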
Michael Oswald
2009-02-20 13:48:34 UTC
Hm, looks a lot cleaner than my solution, learned some new things, thanks!

Anyway, since the current machine I'm on is not multicore (it doesn't
even have hyperthreading), I edited your example and replaced parMap
with a normal map.

The comparison:

set runs to 1 and max_iterations to 99888 for all programs, then compile
with:

C++: compiled with gcc 3.3.3 with -O3
OCaml: with ocamlopt 3.11.0
Haskell: both with ghc 6.10.1 and -fvia-C -O2 -fexcess-precision
-optc-ffast-math -optc-O3


C++:
2.669u 0.010s 0:02.91 91.7% 0+0k 0+0io 0pf+0w

Ocaml:
11.682u 0.023s 0:12.09 96.7% 0+0k 0+0io 0pf+0w

Haskell (the version I posted above):
16.190u 0.030s 0:18.84 86.0% 0+0k 0+0io 0pf+0w

Haskell (Florians version):
20.034u 0.045s 0:23.20 86.5% 0+0k 0+0io 0pf+0w

So in speed factors:
C++    OCaml   Haskell_1   Haskell_2
1      4.38    6.07        7.5

So my conclusion on this topic is:
C++ is unbeatable in this example. This can be expected, since -O3
performs complete loop unrolling.
Can't say much about OCaml, since I don't know it too well. A lot slower
than C++ but still fast.
For the differences between the Haskell versions I would have to profile,
so I can only speculate that they have something to do with the
mandelbrot function executing in the IO monad in my version versus the
purer, lazier approach from Florian. This could support the claim that a
more imperative style can lead to greater execution speed.

But still, from a reader's point of view (and that is strictly my own
personal opinion), I like Florian's version most just because of its
visual simplicity and conciseness, and if I don't have to go for raw
speed, I would take his solution. But of course I am quite biased here
towards Haskell.


Anyway, this was quite interesting.


lg,
Michael
Florian Kreidler
2009-02-20 20:44:17 UTC
Post by Michael Oswald
Hm, looks a lot cleaner than my solution, learned some new things, thanks!
Anyway, since the current machine I'm on is no multicore (it doesn't
have even hyperthreading), I edited your example and replaced parMap
with a normal map.
When you compile it without the command line parameter "-threaded",
parMap and map should behave identically. So there is no need to write
two different versions for parallel and sequential evaluation.
Post by Michael Oswald
C++ Ocaml Haskell_1 Haskell_2
1 4.38 6.07 7.5
Here is another version that uses unboxed arrays from package uvector:

------------
module Main(main) where

import Control.Parallel.Strategies
import Data.Array.Vector

max_iterations :: Int
max_iterations = 65536

pixel :: Int -> Int -> Char
pixel x y = loop 0.0 0.0 1
  where
    ci, cr :: Float
    ci = fromIntegral y / 40.0
    cr = fromIntegral x / 40.0 - 0.5
    loop zi zr i
      | i > max_iterations = '*'
      | zi2 + zr2 > 4.0    = ' '
      | otherwise          = loop zi' zr' (i + 1)
      where temp = zr * zi
            zr2 = zr * zr
            zi2 = zi * zi
            zi' = temp + temp + ci
            zr' = zr2 - zi2 + cr

mandelbrot :: String
mandelbrot = unlines $ map fromU image
  where image = parMap (`seq` ()) line [-39..38]
        line y = mapU (pixel y) $ enumFromToU (-39) 38

main :: IO ()
main = putStrLn mandelbrot
------------

When run single-threaded, it is 30% slower than your C++ version.
With two cores enabled, it is about as fast as C++.
Post by Michael Oswald
C++ is unbeatable in this example. This can be expected, since -O3
performs complete loop unrolling optimization.
Can't say much on OCaml, since I don't know it too well. A lot slower
than C++ but still fast.
Another conclusion: Haskell can outperform OCaml when state-of-the-art
libraries are used. And Haskell is the only language where
parallelisation comes (nearly) for free.
Post by Michael Oswald
For the differences in the Haskell versions I would have to profile for
the difference, so I can only speculate that it has something to do with
the mandelbrot function executing in the IO monad in my version and the
more pure lazy approach from Florian. This could support the statement,
that a more imperative style can lead to a greater execution speed.
The slowdown in my former version seems to come from lazy evaluation.
In the current version, the compiler automatically translates the
expression

mapU (pixel y) $ enumFromToU (-39) 38

into a single loop. That is impossible when your program is written
in imperative style. So, programming in functional higher-order
style and choosing the right data representation allows you to combine
maximum performance with maximum clarity - and makes parallelisation
a no-brainer.
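
To illustrate (this is only a sketch of the resulting loop structure, not
the code GHC actually generates), the fused line computation behaves like
one direct recursion over x, with no intermediate array of Ints:

    -- sketch: what mapU (pixel y) (enumFromToU (-39) 38) fuses into
    line y = go (-39)
      where go x | x > 38    = []
                 | otherwise = pixel y x : go (x + 1)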
Dan Doel
2009-02-21 09:22:41 UTC
Post by Florian Kreidler
The slowdown in my former version seems to come from lazy evaluation.
In the current version, the compiler automatically translates the
expression
mapU (pixel y) $ enumFromToU (-39) 38
into a single loop. That is impossible when your program is written
in imperative style. So, programming in functional higher-order
style and choosing the right data representation allows you to combine
maximum performance with maximum clarity - and makes parallelisation
a no-brainer.
That's probably not a "lazy evaluation" issue, then. It's a fusion issue.
GHC currently uses foldr/build fusion for lists, but that has some
noticeable gaps (for instance, it's no good for fusing left folds). uvector
is built on stream fusion, but there's also a "stream-fusion" package on
hackage that provides stream fusion versions of various functions in
Data.List and Control.Monad.
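
For the curious, the foldr/build rule at the heart of it is tiny:

    -- GHC's rewrite rule, where build g = g (:) []
    foldr k z (build g) = g k z

When a producer is written with build and the consumer is a foldr, the
rewrite removes the intermediate list entirely; a left fold doesn't have
the foldr shape, which is where those gaps come from.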

It's not obvious to me what specifically wouldn't fuse in your original
example, however. But it's possible that using stream-fusion would perform
as well as using uvector.

The one discrepancy is that it doesn't include an enumFromTo, but you can
write it as (for this case):

enumFromTo m n = unfoldr f m
  where
    f k | k <= n    = Just (k, k+1)
        | otherwise = Nothing

which should fuse correctly.

Cheers,
-- Dan
Dan Doel
2009-02-21 10:36:25 UTC
I did a bit of poking around on my own, and thought I'd give my findings.

First, the uvector version you posted here has a slight difference, in that
it uses Ints in enumFromToU, whereas the list version uses Doubles (because
it's impossible to use Doubles with the UArr version). However, if you
change the list version to use Ints as well, a bunch of other optimizations
trigger, and the code becomes significantly faster. A difference on my machine
of around 15 - 18 vs. 7 - 8 seconds (I don't have a multi-core, so I changed
everything back to regular map, by the way).

A uvector version runs in 5 - 6 seconds here, so the list version is pretty
close. I was unable to make a stream-fusion version go faster than the
built-in list functions (and had problems with specialization of my custom
enumFromTo for some reason), but that's unsurprising because there's nothing
in this program that foldr/build fusion can't handle.

I'm not sure where the extra performance comes from with uvector, but it is
more state-of-the-art than stream-fusion, and its being less beholden to the
exact behavior of the functions from Data.List may give it an edge over the
stream-fusion package (it is not, however, in my case due to the UArr
structure being strict, other than how that affects the definitions of the
functions, because all the intermediate UArrs should be fused away to become
loops).

I'll post my alternate list code below, although it's not significantly
different than your original.

-- Dan

module Main(main) where

import Prelude hiding (iterate)

max_iterations :: Int
max_iterations = 65536

iterate :: Double -> Double -> Char
iterate ci cr = loop 0.0 0.0 1
  where
    loop zi zr i
      | i > max_iterations = '*'
      | zi2 + zr2 > 4.0    = ' '
      | otherwise          = loop zi' zr' (i + 1)
      where temp = zr * zi
            zr2 = zr * zr
            zi2 = zi * zi
            zi' = temp + temp + ci
            zr' = zr2 - zi2 + cr

mandelbrot :: String
mandelbrot = unlines image
  where image = map line [-39..38 :: Int]
        line y = map (pixel y) [-39..38 :: Int]
        pixel y x = iterate (fromIntegral x / 40.0)
                            (fromIntegral y / 40.0 - 0.5)

main :: IO ()
main = putStrLn mandelbrot
Michael Oswald
2009-02-20 17:01:44 UTC
Post by Dan Doel
I'll post my alternate list code below, although it's not significantly
different than your original.
-- Dan
[...]
Just for the sake of completeness, here are my results on another machine:

Note: I changed all max_iterations to 99888.


C++: (gcc 4.3.2 with -O3)
-------------------------
./time Mandelbrot_cpp:
2.544u 0.004s 0:02.57 98.8% 0+0k 0+0io 0pf+0w

OCaml:
------
William's version:
12.052u 0.004s 0:12.14 99.2% 0+0k 0+0io 0pf+0w

Jon's version:
6.108u 0.064s 0:03.30 186.6% 0+0k 0+0io 0pf+0w


Haskell:
--------
mine:
16.125u 0.080s 0:16.48 98.3% 0+0k 0+0io 0pf+0w

Florian:
20.037u 0.188s 0:20.54 98.3% 0+0k 0+0io 0pf+0w

Florian UVector:
4.508u 0.008s 0:04.62 97.4% 0+0k 0+0io 0pf+0w

Dan (Int in lists)
10.756u 0.048s 0:10.87 99.2% 0+0k 0+0io 0pf+0w


lg,
Michael
Jon Harrop
2009-02-21 10:56:56 UTC
Post by unknown
[...]
When run single-threaded, it is 30% slower than your C++ version.
With two cores enabled, it is about as fast as C++.
Again, I cannot compile this program due to missing third-party libraries.
Post by unknown
Post by Michael Oswald
C++ is unbeatable in this example. This can be expected, since -O3
performs complete loop unrolling optimization.
Can't say much on OCaml, since I don't know it too well. A lot slower
than C++ but still fast.
Another conclusion: Haskell can outperform OCaml when state-of-the-art
libraries are used.
Haskell can outperform crippled OCaml, perhaps. My standalone OCaml takes
0.34s here, which is much faster than any of the Haskell implementations
that I can compile and much faster than the performance figures quoted by
you and others for Haskell on any other machine.
Post by unknown
And Haskell is the only language where parallelisation comes (nearly) for
free.
Only if you neglect the effort required to find and install these third
party libraries and suffer command lines so complicated that you yourself
made three errors in quoting them. Even then your "parallel" version is now
running 63x slower than my OCaml and the more cores I use the slower it
gets.

That is not "nearly free" by any stretch of the imagination.
Post by unknown
So, programming in functional higher-order style and choosing the right
data representation allows you to combine maximum performance with maximum
clarity - and makes parallelisation a no-brainer.
Look at the performance of your Haskell on my 8 core:

$ time ./mandelbrot +RTS -N2
real 0m42.273s
user 1m21.169s
sys 0m0.360s
$ time ./mandelbrot +RTS -N3
real 0m28.880s
user 1m19.801s
sys 0m0.564s
$ time ./mandelbrot +RTS -N4
real 0m21.457s
user 1m17.645s
sys 0m0.592s
$ time ./mandelbrot +RTS -N5
real 0m18.068s
user 1m18.905s
sys 0m0.664s
$ time ./mandelbrot +RTS -N6
real 0m16.010s
user 1m21.121s
sys 0m0.736s
$ time ./mandelbrot +RTS -N7
real 0m17.340s
user 1m34.294s
sys 0m0.756s
$ time ./mandelbrot +RTS -N8
real 0m26.079s
user 2m25.685s
sys 0m0.856s

Two points:

1. My OCaml is orders of magnitude faster at 0.34s.

2. The overhead of your approach to parallelism is so inefficient that the
whole program sees performance worsen beyond only 6 cores.

The theoretical points you raise are devoid of merit. Haskell may well be
facilitating many optimizations internally but if the code remains a lot
slower externally, those optimizations are useless. GHC may well make
parallelism easy but if that parallelism fails to give a performance
advantage, it is useless.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-21 11:49:47 UTC
Post by Jon Harrop
Post by Florian Kreidler
(...)
When run single-threaded, it is 30% slower than your C++ version.
With two cores enabled, it is about as fast as C++.
Again, I cannot compile this program due to missing third-party libraries.
I wouldn't call Don Stewart a third party. :) The library is available at
Hackage. Just type

cabal install uvector

and it will work fine. This library is part of the current quasi-standard,
so I wonder why you refuse to use it.
Post by Jon Harrop
Post by Florian Kreidler
Post by Michael Oswald
C++ is unbeatable in this example. This can be expected, since -O3
performs complete loop unrolling optimization.
Can't say much on OCaml, since I don't know it too well. A lot slower
than C++ but still fast.
Another conclusion: Haskell can outperform OCaml when state-of-the-art
libraries are used.
Haskell can outperform crippled OCaml, perhaps. My standalone OCaml takes
0.34s here, which is much faster than any of the Haskell implementations
that I can compile and much faster than the performance figures quoted by
you and others for Haskell on any other machine.
1) You choose not to try the fast Haskell versions, and base your
conclusions on that choice.
2) You compare the measurements from some single-core or dual-core processors
to measurements from your 8-core machine.

Both are, at least, questionable.
Post by Jon Harrop
Post by Florian Kreidler
And Haskell is the only language where parallelisation comes (nearly) for
free.
Only if you neglect the effort required to find and install these third
party libraries and suffer command lines so complicated that you yourself
made three errors in quoting them. Even then your "parallel" version is now
running 63x slower than my OCaml and the more cores I use the slower it
gets.
1) I made only one error in the command options that are needed to parallelize
a program. I misquoted the run-time option "-Nx" as "+Nx". The other
requirement is to add "-threaded" to the compiler options, which I was
able to communicate.
(Besides that, I added a superfluous 'a' to the option "-fvia-C". This
option has nothing to do with parallelisation. And you forgot to provide
the command line switch "--make", which should be known to anyone who uses
Haskell)

2) The effort required to install the needed "third party" libraries - of
which one is part of the extralibs package that ships with the ghc
distribution, and the other is available at the central code repository
hackage.haskell.org - adds up to one command line:
cabal install uvector parallel
Post by Jon Harrop
That is not "nearly free" by any stretch of the imagination.
I was able to parallelize the program by replacing "map" with "parMap rnf".
No need to mess with Unix processes or file handles; you just change one
line of code and add two small command line options. I call that "nearly
free".
Post by Jon Harrop
Post by Florian Kreidler
So, programming in functional higher-order style and choosing the right
data representation allows you to combine maximum performance with maximum
clarity - and makes parallelisation a no-brainer.
$ time ./mandelbrot +RTS -N2
real 0m42.273s
user 1m21.169s
sys 0m0.360s
$ time ./mandelbrot +RTS -N3
real 0m28.880s
user 1m19.801s
sys 0m0.564s
$ time ./mandelbrot +RTS -N4
real 0m21.457s
user 1m17.645s
sys 0m0.592s
$ time ./mandelbrot +RTS -N5
real 0m18.068s
user 1m18.905s
sys 0m0.664s
$ time ./mandelbrot +RTS -N6
real 0m16.010s
user 1m21.121s
sys 0m0.736s
$ time ./mandelbrot +RTS -N7
real 0m17.340s
user 1m34.294s
sys 0m0.756s
$ time ./mandelbrot +RTS -N8
real 0m26.079s
user 2m25.685s
sys 0m0.856s
1. My OCaml is orders of magnitude faster at 0.34s.
2. The overhead of your approach to parallelism is so inefficient that the
whole program sees performance worsen beyond only 6 cores.
Sorry, but I can't reproduce your results. On my computer, the Haskell
program that uses the uvector library is about 20 times faster, and the
OCaml program about 12 times slower, than on your machine.
Post by Jon Harrop
The theoretical points you raise are devoid of merit. Haskell may well be
facilitating many optimizations internally but if the code remains a lot
slower externally, those optimizations are useless. GHC may well make
parallelism easy but if that parallelism fails to give a performance
advantage, it is useless.
Right. But that's not the case. At least not on my computer.
Jon Harrop
2009-02-21 17:24:13 UTC
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
(...)
When run single-threaded, it is 30% slower than your C++ version.
With two cores enabled, it is about as fast as C++.
Again, I cannot compile this program due to missing third-party libraries.
I wouldn't call Don Stewart a third party. :) The library is available at
Hackage. Just type
cabal install uvector
and it will work fine. This library is part of the current quasi-standard,
so I wonder why you refuse to use it.
Once I'd installed Cabal, parsec, network, HTTP, zlib and cabal-install it
was then a case of:

$ cabal update
$ cabal install uvector
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Michael Oswald
C++ is unbeatable in this example. This can be expected, since -O3
performs complete loop unrolling optimization.
Can't say much on OCaml, since I don't know it too well. A lot slower
than C++ but still fast.
Another conclusion: Haskell can outperform OCaml when state-of-the-art
libraries are used.
Haskell can outperform crippled OCaml, perhaps. My standalone OCaml takes
0.34s here, which is much faster than any of the Haskell implementations
that I can compile and much faster than the performance figures quoted by
you and others for Haskell on any other machine.
1) You choose not to try the fast Haskell versions...
Now that I have managed to get them to compile, it turns out they are
still not faster than my original parallel OCaml:

$ time ./mandelbrot +RTS -N1
real 0m2.022s
user 0m2.016s
sys 0m0.000s
$ time ./mandelbrot +RTS -N2
real 0m1.749s
user 0m3.024s
sys 0m0.000s
$ time ./mandelbrot +RTS -N3
real 0m1.031s
user 0m2.124s
sys 0m0.016s
$ time ./mandelbrot +RTS -N4
real 0m0.942s
user 0m2.844s
sys 0m0.012s
$ time ./mandelbrot +RTS -N5
real 0m0.723s
user 0m2.384s
sys 0m0.052s
$ time ./mandelbrot +RTS -N6
real 0m0.593s
user 0m2.184s
sys 0m0.088s
$ time ./mandelbrot +RTS -N7
real 0m0.551s
user 0m2.348s
sys 0m0.008s
$ time ./mandelbrot +RTS -N8
real 0m0.713s
user 0m2.864s
sys 0m0.072s
$ time ./mandelbrot +RTS -N9
real 0m0.673s
user 0m2.740s
sys 0m0.128s
$ time ./mandelbrot +RTS -N10
real 0m0.496s
user 0m2.320s
sys 0m0.064s
$ time ./mandelbrot +RTS -N11
real 0m0.503s
user 0m2.260s
sys 0m0.076s
$ time ./mandelbrot +RTS -N12
real 0m0.607s
user 0m2.176s
sys 0m0.052s
$ time ./mandelbrot +RTS -N77
real 0m8.001s
user 1m0.580s
sys 0m0.568s

My OCaml is not only faster, but it didn't require any of this hand
tweaking (which cannot be done on real programs), and it scales
equivalently to the last measurement above, where Haskell is 24x slower
than OCaml.
Post by Florian Kreidler
2) The effort required to install the needed "third party" libraries - of
which one is part of the extralibs package, that is part of the ghc
distribution, and the other is available at the central code repository
cabal install uvector parallel
No, it added up to a dozen lines of hacking to get the various bits and
bobs from around the internet that you had to depend upon, precisely
because GHC does not bundle this functionality, just as OCaml does not.
Post by Florian Kreidler
Post by Jon Harrop
That is not "nearly free" by any stretch of the imagination.
I was able to parallelize the program by replacing "map" with "parMap
rnf". No need to mess with Unix processes or file handles; you just change
one line of code and add two small command line options. I call
that "nearly free".
You can put my "invoke" function in a library called "extralibs" and pretend
it is a standard if it really means that much to you.
Post by Florian Kreidler
Post by Jon Harrop
1. My OCaml is orders of magnitude faster at 0.34s.
2. The overhead of your approach to parallelism is so inefficient that
the whole program sees performance worsen beyond only 6 cores.
Sorry, but I can't reproduce your results. On my computer, the Haskell
program that uses the uvector library is about 20 times faster,
That agrees with the timings I just gave.
Post by Florian Kreidler
Post by Jon Harrop
The theoretical points you raise are devoid of merit. Haskell may well be
facilitating many optimizations internally but if the code remains a lot
slower externally, those optimizations are useless. GHC may well make
parallelism easy but if that parallelism fails to give a performance
advantage, it is useless.
Right. But that's not the case. At least not on my computer.
But it is the case overall: Haskell is 50% to 660x slower than OCaml.

Granted, you have apparently found a setup where Haskell runs slowly and
OCaml runs even more slowly, but that is of little interest in the context
of high-performance numerics. Anyone interested in high-performance
computing will not be running their CPU in legacy 32-bit mode...
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Manlio Perillo
2009-02-21 21:23:56 UTC
Post by Jon Harrop
[...]
Now that I have managed to get them to compile it turns out they are
$ time ./mandelbrot +RTS -N7
real 0m0.551s
user 0m2.348s
sys 0m0.008s
My OCaml is not only faster but it didn't require any of this hand
tweaking that cannot be done on real programs and it scales equivalently
to the last measurement above, where Haskell is 24x slower than OCaml.
I'm not sure.

Your OCaml version spawns *a lot* of processes (and they are not properly
killed, since I see many <defunct> in top), stresses the system (the load
average reaches 4.2), and takes 0.34 seconds.

The best Haskell version, using 7 worker threads, takes 0.551 seconds
(so, it is 1.6x slower) and the load average is about 0.6.



Manlio Perillo
Jon Harrop
2009-02-22 18:03:08 UTC
Post by Manlio Perillo
Post by Jon Harrop
My OCaml is not only faster but it didn't require any of this hand
tweaking that cannot be done on real programs and it scales equivalently
to the last measurement above, where Haskell is 24x slower than OCaml.
I'm not sure.
Your OCaml version spawns *a lot* of processes (and they are not properly
killed, since I see many <defunc> in top), stress the system (the load
average reach 4.2), and takes 0.34 seconds.
A few dozen processes is not "*a lot*" and Linux handles it just fine. Also,
the processes are reaped correctly, just not at the earliest possible time.
Post by Manlio Perillo
The best Haskell version, using 7 worker threads, takes 0.551 seconds
(so, it is 1.6x slower) and the load average is about 0.6.
The Haskell probably has a lower load average and takes longer because it
spends too long performing unnecessary global synchronizations.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Manlio Perillo
2009-02-22 19:02:12 UTC
Post by Jon Harrop
Post by Manlio Perillo
Post by Jon Harrop
My OCaml is not only faster but it didn't require any of this hand
tweaking that cannot be done on real programs and it scales
equivalently to the last measurement above, where Haskell is 24x
slower than OCaml.
I'm not sure.
Your OCaml version spawns *a lot* of processes (and they are not
properly killed, since I see many <defunc> in top), stress the system
(the load average reach 4.2), and takes 0.34 seconds.
A few dozen processes is not "*a lot*" and Linux handles it just fine.
It's not "a few dozen".

I have tried to increase the number of iterations by a factor of 10.
Running the program (on an Intel Core2 CPU, T7200) simply made my system
freeze.
Post by Jon Harrop
Also, the processes are reaped correctly, just not at the earliest
possible time.
Post by Manlio Perillo
The best Haskell version, using 7 worker threads, takes 0.551 seconds
(so, it is 1.6x slower) and the load average is about 0.6.
The Haskell probably has a lower load average and takes longer because
it spends too long performing unnecessary global synchronizations.
Yes, it is possible.


Manlio Perillo
Jon Harrop
2009-02-22 19:16:49 UTC
Post by Manlio Perillo
Post by Jon Harrop
Post by Manlio Perillo
Post by Jon Harrop
My OCaml is not only faster but it didn't require any of this hand
tweaking that cannot be done on real programs and it scales
equivalently to the last measurement above, where Haskell is 24x
slower than OCaml.
I'm not sure.
Your OCaml version spawns *a lot* of processes (and they are not
properly killed, since I see many <defunc> in top), stress the system
(the load average reach 4.2), and takes 0.34 seconds.
A few dozen processes is not "*a lot*" and Linux handles it just fine.
It's not "a few dozen".
No, it really is only a few dozen for the given problem.
Post by Manlio Perillo
I have tried to increment the number of iterations, by a factor of 10.
Running the program (on an Intel Core2 CPU, T7200) simply made my system
freeze.
Sounds like you changed the program to increase the number of threads
spawned to hundreds instead of dozens, in which case it will grind to a
halt. However, you can easily tweak the program to handle other problems as
well.
Post by Manlio Perillo
Post by Jon Harrop
Post by Manlio Perillo
The best Haskell version, using 7 worker threads, takes 0.551 seconds
(so, it is 1.6x slower) and the load average is about 0.6.
The Haskell probably has a lower load average and takes longer because
it spends too long performing unnecessary global synchronizations.
Yes, it is possible.
What is possible?
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Manlio Perillo
2009-02-24 11:55:47 UTC
Post by Jon Harrop
[...]
Post by Manlio Perillo
I have tried to increment the number of iterations, by a factor of 10.
Running the program (on an Intel Core2 CPU, T7200) simply made my
system freeze.
Sounds like you changed the program to increase the number of threads
spawned to hundreds instead of dozens,
I have just changed the value of max_iterations from 65536 to 655360.
It is your program that spawns processes without control.
Post by Jon Harrop
in which case it will grind to a
halt. However, you can easily tweak the program to handle other problems
as well.
And this will surely make the program much more complex than the Haskell
version.
Post by Jon Harrop
Post by Manlio Perillo
Post by Jon Harrop
Post by Manlio Perillo
The best Haskell version, using 7 worker threads, takes 0.551 seconds
(so, it is 1.6x slower) and the load average is about 0.6.
The Haskell probably has a lower load average and takes longer because
it spends too long performing unnecessary global synchronizations.
Yes, it is possible.
What is possible?
That GHC "spends too long performing unnecessary global synchronizations".



Regards Manlio Perillo
Jon Harrop
2009-02-24 15:04:12 UTC
Post by Manlio Perillo
Post by Jon Harrop
Post by Manlio Perillo
I have tried to increment the number of iterations, by a factor of 10.
Running the program (on an Intel Core2 CPU, T7200) simply made my
system freeze.
Sounds like you changed the program to increase the number of threads
spawned to hundreds instead of dozens,
I have just changed the value of max_iterations from 65536 to 655360.
It if your program that spawn processes without control.
That is incorrect: it spawns one process per row and, therefore, will still
only spawn a few dozen processes. Hence that runs perfectly here.

If you increase the height of the image then it will spawn more processes
and *that* will eventually cripple your machine but increasing the number
of iterations will not.

You may experience a slowdown while the program runs but, of course, that is
simply because my OCaml code is making efficient use of your machine's
compute power.
Post by Manlio Perillo
Post by Jon Harrop
in which case it will grind to a
halt. However, you can easily tweak the program to handle other problems
as well.
And this will surely make the program much more complex than the Haskell
version.
That is incorrect for two reasons:

1. Florian's Haskell code also chunks computations by row and, therefore,
will also parallelize badly on both short rows and many rows.

2. The workaround requires only two more lines of code in the OCaml: you
just partition the actual number of rows into a constant number of work
items.
Post by Manlio Perillo
Post by Jon Harrop
Post by Manlio Perillo
Post by Jon Harrop
Post by Manlio Perillo
The best Haskell version, using 7 worker threads, takes 0.551 seconds
(so, it is 1.6x slower) and the load average is about 0.6.
The Haskell probably has a lower load average and takes longer because
it spends too long performing unnecessary global synchronizations.
Yes, it is possible.
What is possible?
That GHC "spends too long performing unnecessary global synchronizations".
Yes, it is a serious and well-known limitation of GHC's current design,
but GHC still represents the state of the art in the Haskell world.

The state-of-the-art for load balanced parallelism is Cilk-style wait-free
work-stealing queues. Haskell does not have them. Indeed, I doubt Haskell
can even express them. Microsoft's Task Parallel Library provides this
for .NET languages.

The state-of-the-art for high-level language implementations is a mostly
concurrent GC. Haskell does not have one (GHC suspends all threads for a
non-concurrent collection, destroying both interactivity and scalability).
Java and .NET both have scalable mostly concurrent GCs with low pause
times.

Florian referring to the Haskell as "state-of-the-art" in this context when
its foundation is years behind what people are already using in industry is
absurd. Parnell trying to pretend that these benchmark results are not an
embarrassing failure for functional languages does nothing to improve the
situation.

The situation can theoretically be improved but none of today's standalone
FPL implementations come close to what is already available on the JVM
and .NET.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-24 16:25:01 UTC
Post by Jon Harrop
The state-of-the-art for load balanced parallelism is Cilk-style wait-free
work-stealing queues...
The state-of-the-art for high-level language implementations is a mostly
concurrent GC...
Florian referring to the Haskell as "state-of-the-art" in this context when
its foundation is years behind what people are already using in industry is
absurd.
Again, he is misquoting me. I called Haskell's uvector package a
state-of-the-art library. Nothing else.
parnell
2009-02-24 17:44:56 UTC
Post by Jon Harrop
[...]
Yes, it is a serious and well-known limitation of GHC's current design,
but GHC still represents the state of the art in the Haskell world.
The state-of-the-art for load balanced parallelism is Cilk-style wait-free
work-stealing queues. Haskell does not have them. Indeed, I doubt Haskell
can even express them. Microsoft's Task Parallel Library provides this
for .NET languages.
The state-of-the-art for high-level language implementations is a mostly
concurrent GC. Haskell does not have one (GHC suspends all threads for a
non-concurrent collection, destroying both interactivity and scalability).
Java and .NET both have scalable mostly concurrent GCs with low pause
times.
Florian referring to the Haskell as "state-of-the-art" in this context when
its foundation is years behind what people are already using in industry is
absurd. Parnell trying to pretend that these benchmark results are not an
embarrassing failure for functional languages does nothing to improve the
situation.
I thought we were comparing functional languages, specifically OCaml,
F#, and Haskell? I would gladly voice my displeasure if I were
presented with a concurrent C++ version, or .Net or Java version that
was significantly faster than Florian's uvector version.

Cilk is a very interesting technology and is very much state of the
art. I have not had the pleasure of using it and if it truly scales
linearly as cores are added then it is far ahead of the CLR, JVM, and
GHC.
Paul Rubin
2009-02-24 19:26:04 UTC
Post by parnell
Cilk is a very interesting technology and is very much state of the
art. I have not had the pleasure of using it and if it truly scales
linearly as cores are added then it is far ahead of the CLR, JVM, and
GHC.
At some point in this, Data Parallel Haskell should also come into play.
I think there are even some extensions in the works for it to offload
some of the computation onto an ultra-parallel GPGPU graphics card.
Multi-core is nothing compared to that ;-).
Jon Harrop
2009-02-24 22:32:38 UTC
Post by Paul Rubin
Post by parnell
Cilk is a very interesting technology and is very much state of the
art. I have not had the pleasure of using it and if it truly scales
linearly as cores are added then it is far ahead of the CLR, JVM, and
GHC.
At some point in this, Data Parallel Haskell should also come into play.
I think there are even some extensions in the works for it to offload
some of the computation onto an ultra-parallel GPGPU graphics card.
Multi-core is nothing compared to that ;-).
Again, F# had it before Haskell. See the Accelerator project from MSR.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Jon Harrop
2009-02-24 22:31:18 UTC
Post by parnell
Post by Jon Harrop
Florian referring to the Haskell as "state-of-the-art" in this context
when its foundation is years behind what people are already using in
industry is absurd. Parnell trying to pretend that these benchmark
results are not an embarrassing failure for functional languages does
nothing to improve the situation.
I thought we were comparing functional languages, specifically OCaml,
F#, and Haskell?
I thought we were comparing functional languages and style with other
languages and styles. Imperative style in OCaml is faster than functional
style in OCaml and Haskell and imperative C is much faster still. There is
no excuse for that.
Post by parnell
I would gladly voice my displeasure if I were presented with a concurrent
C++ version, or .Net or Java version that was significantly faster than
Florian's uvector version.
Well, here is a simple parallel F# implementation that runs on .NET:

#light

open Math.Complex

let n = 78
let max_iterations = 65536

let iterate i j =
  let cr, ci = float(i - n/2) / 40. - 0.5, float(j - n/2) / 40.
  let rec loop zr zi i =
    if i > max_iterations then '*' else
      let zr2 = zr * zr
      let zi2 = zi * zi
      if zi2 + zr2 > 4.0 then ' ' else
        loop (zr2 - zi2 + cr) (2. * zr * zi + ci) (i + 1)
  loop cr ci 1

do
  let t = System.Diagnostics.Stopwatch.StartNew()
  let ss =
    seq { for j in 0 .. n-1 ->
            async { return Array.init n (iterate j) } }
  for s in Async.Run(Async.Parallel ss) do
    Array.iter print_char s
    printf "\n"
  printfn "\n%dms" t.ElapsedMilliseconds

I've used the inefficient thread pool instead of the TPL because I want to
run it on Mono 2.2, where it turns out to be as fast as the Haskell on 8
cores.

From the SciMark2 benchmark results I'd guess .NET would be 50% faster
again, i.e. as fast as the OCaml. But that is still many times slower than
necessary and F# does not optimize complex arithmetic.
Post by parnell
Cilk is a very interesting technology and is very much state of the
art. I have not had the pleasure of using it and if it truly scales
linearly as cores are added then it is far ahead of the CLR, JVM, and
GHC.
AFAIK the JVM and CLR both scale extremely well on GC intensive tasks and
this task should not even require any GC. IIRC, the CLR GC runs 17x faster
on 32 cores which is pretty good scaling.

Here is a trivial imperative Cilk implementation that is several times
faster than all of the parallel functional implementations on my machine:

#include <cilk-lib.cilkh>
#include <stdio.h>
#include <stdlib.h>

int max_i = 65536;

char loop(double cr, double ci) {
    double zr=cr, zi=ci;
    int i=1;
    while (zr*zr + zi*zi <= 4.0) {
        double ozr=zr;
        zr = zr*zr - zi*zi + cr;
        zi = 2*ozr*zi + ci;
        if (i++ == max_i) return '*';
    }
    return ' ';
}

const int n=78;

cilk void row(char *s, int j0, int j2) {
    if (j2-j0 == 1) {
        int i=0;
        for (i=0; i<n; ++i)
            s[i+j0*(n+1)] = loop((j0-n/2)/40.0-0.5, ((i-n/2)/40.0));
        s[n+j0*(n+1)] = '\n';
    } else {
        int j1 = (j0 + j2)/2;
        spawn row(s, j0, j1);
        spawn row(s, j1, j2);
    }
    sync;
}

cilk int main() {
    char *s = malloc((n+1)*n+1);
    spawn row(s, 0, n);
    sync;
    printf("%s", s);
    free(s);
    return 0;
}

This has the following characteristics:

. Recursive divide and conquer is used in the "row" function to create work
items that are worth stealing because they are likely to spawn many
children.

. Run-time performance is 2x faster on 1 core and 4x faster on 8 cores than
the fastest Haskell. So the Cilk scales a lot better than the Haskell.

. Uses individual doubles because GCC does a poor job in optimizing C99
complex arithmetic. There is no excuse for that either.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
parnell
2009-02-24 23:06:06 UTC
Post by Jon Harrop
Post by parnell
Post by Jon Harrop
Florian referring to the Haskell as "state-of-the-art" in this context
when its foundation is years behind what people are already using in
industry is absurd. Parnell trying to pretend that these benchmark
results are not an embarrassing failure for functional languages does
nothing to improve the situation.
I thought we were comparing functional languages, specifically OCaml,
F#, and Haskell?
I thought we were comparing functional languages and style with other
languages and styles. Imperative style in OCaml is faster than functional
style in OCaml and Haskell and imperative C is much faster still. There is
no excuse for that.
Post by parnell
I would gladly voice my displeasure if I were presented with a concurrent
C++ version, or .Net or Java version that was significantly faster than
Florian's uvector version.
[...]
I've used the inefficient thread pool instead of the TPL because I want to
run it on Mono 2.2, where it turns out to be as fast as the Haskell on 8
cores.
Thanks for the examples Jon.
I ran your F# and Florian's uvector version on a Windows installation.
I think that I need to update my F# and .NET libraries because I was
not able to get the same results you did.

.NET 2.0
F# 1.9.4.15
Intel Dual Core 3.0 GHz
Windows XP Service Pack 2

Your F# code
.995 seconds

Florian's Haskell:
.757 seconds
Post by Jon Harrop
From the SciMark2 benchmark results I'd guess .NET would be 50% faster
again, i.e. as fast as the OCaml. But that is still many times slower than
necessary and F# does not optimize complex arithmetic.
Post by parnell
Cilk is a very interesting technology and is very much state of the
art.  I have not had the pleasure of using it and if it truly scales
linearly as cores are added then it is far ahead of the CLR, JVM, and
GHC.
AFAIK the JVM and CLR both scale extremely well on GC intensive tasks and
this task should not even require any GC. IIRC, the CLR GC runs 17x faster
on 32 cores which is pretty good scaling.
Yes this is excellent, but not linear.
Post by Jon Harrop
[...]
Jon Harrop
2009-02-24 23:57:16 UTC
Post by parnell
Thanks for the examples Jon.
I ran your F# and Florian's uvector version on a windows installation,
I think that I need to update my F# and .NET libraries because I was
not able to get the same results you did.
.NET 2.0
F# 1.9.4.15
Intel Dual Core 3.0 GHz
Windows XP Service Pack 2
Your F# code
.995 seconds
.757 seconds
I get 0.834s for F# 1.9.6.2 on .NET 3.5 SP1 on my dual core 2.2GHz Athlon 64
X2.

I'm surprised your 3GHz Intel is running slower than my 2.2GHz Athlon.
Exactly what CPU do you have?

I cannot be bothered to go through that Haskell install on Windows. I'm
toying with the idea of buying another 8 core for Windows but F# sales will
have to pick up to make it worth my while. ;-)
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
parnell
2009-02-25 14:15:03 UTC
Post by Jon Harrop
Post by parnell
Thanks for the examples Jon.
I ran your F# and Florian's uvector version on a windows installation,
I think that I need to update my F# and .NET libraries because I was
not able to get the same results you did.
.NET 2.0
F# 1.9.4.15
Intel Dual Core 3.0 GHz
Windows XP Service Pack 2
Your F# code
.995 seconds
.757 seconds
I get 0.834s for F# 1.9.6.2 on .NET 3.5 SP1 on my dual core 2.2GHz Athlon 64
X2.
I'm surprised your 3GHz Intel is running slower than my 2.2GHz Athlon.
Exactly what CPU do you have?
I cannot be bothered to go through that Haskell install on Windows. I'm
toying with the idea of buying another 8 core for Windows but F# sales will
have to pick up to make it worth my while. ;-)
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
I am not sure what was going on with my box when I ran those tests
yesterday, but I am getting much better performance today after a
reboot (I had shut off all applications last night but I guess
something was lurking).

Today after 7 runs for each I get these results:

F#
longest time .861 seconds
shortest time .785 seconds
mean time .797 seconds


Haskell
longest time .721 seconds
shortest time .669 seconds
mean time .677 seconds

The exact CPU is:
Intel Core2 Duo CPU (Xeon?).

Forget the 8-core Windows box and keep working on the LLVM! ;-)


Parnell
Manlio Perillo
2009-02-24 21:43:41 UTC
Permalink
Post by Jon Harrop
Post by Manlio Perillo
Post by Jon Harrop
Post by Manlio Perillo
I have tried to increment the number of iterations by a factor of
10. Running the program (on an Intel Core2 CPU, T7200) simply made my
system freeze.
Sounds like you changed the program to increase the number of threads
spawned to hundreds instead of dozens,
I have just changed the value of max_iterations from 65536 to 655360.
It is your program that spawns processes without control.
That is incorrect: it spawns one process per row and, therefore, will
still only spawn a few dozen processes. Hence that runs perfectly here.
If you increase the height of the image then it will spawn more
processes and *that* will eventually cripple your machine but increasing
the number of iterations will not.
You may experience a slowdown while the program runs but, of course,
that is simply because my OCaml code is making efficient use of your
machine's compute power.
This is not what I see.
As I have said, I only incremented the value of max_iterations.
And I can see at least 40 processes running at the same time.

I say "at least" because I had to stop the execution after a few seconds.


Can you verify this?
Post by Jon Harrop
[...]
Manlio Perillo
Jon Harrop
2009-02-24 23:51:42 UTC
Permalink
Post by Manlio Perillo
This is not what I see.
As I have said, I only incremented the value of max_iterations.
And I can see at least 40 processes running at the same time.
That's the same as before except they hang around longer so you get to look
at them.
Post by Manlio Perillo
I say "at least" because I had to stop the execution after a few seconds.
Can you verify this?
No. My OCaml with max_iterations at 655360 spawns dozens of processes (top
shows a page full of them) and completes in under 6s. I just tested with
6553600 and that also runs fine. My computer slows down but remains usable.

FWIW, I'm running:

$ uname -a
Linux leper 2.6.24-etchnhalf.1-amd64 #1 SMP Mon Jul 21 12:37:11 UTC 2008
x86_64 GNU/Linux
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
William James
2009-02-21 01:13:05 UTC
Permalink
Post by Michael Oswald
Hm, looks a lot cleaner than my solution; learned some new things, thanks!
Anyway, since the current machine I'm on is not a multicore (it doesn't
even have hyperthreading), I edited your example and replaced parMap
with a normal map.
Set runs to 1 and max_iterations to 99888 for all programs, then:
C++: compiled with gcc 3.3.3 with -O3
OCaml: with ocamlopt 3.11.0
Haskell: both with ghc 6.10.1 and -fvia-C -O2 -fexcess-precision
-optc-ffast-math -optc-O3
2.669u 0.010s 0:02.91 91.7% 0+0k 0+0io 0pf+0w
11.682u 0.023s 0:12.09 96.7% 0+0k 0+0io 0pf+0w
16.190u 0.030s 0:18.84 86.0% 0+0k 0+0io 0pf+0w
20.034u 0.045s 0:23.20 86.5% 0+0k 0+0io 0pf+0w

Relative times:
C++    OCaml   Haskell_1   Haskell_2
1      4.38    6.07        7.5
C++ is unbeatable in this example. This can be expected, since -O3
performs complete loop unrolling optimization.
F# on a 2 GHz laptop (Pentium M; windozeXP)
2.133 sec. (runs=1, max_iterations=99888)
Jon Harrop
2009-02-20 18:48:28 UTC
Permalink
Post by Florian Kreidler
That looks quite verbose. :)
But it is standalone vanilla OCaml that compiles and runs out of the box on
most Unices including OS X.
Post by Florian Kreidler
---------------------------
module Main(main) where

import Control.Parallel.Strategies

max_iterations :: Int
max_iterations = 65536

iterate :: Double -> Double -> Char
iterate ci cr = loop 0.0 0.0 1
  where
    loop zi zr i
      | i > max_iterations = '*'
      | zi2 + zr2 > 4.0    = ' '
      | otherwise          = loop zi' zr' (i + 1)
      where temp = zr * zi
            zr2 = zr * zr
            zi2 = zi * zi
            zi' = temp + temp + ci
            zr' = zr2 - zi2 + cr

mandelbrot :: String
mandelbrot = unlines image
  where image = parMap rnf line [-39..38] -- (*)
        line y = map (pixel y) [-39..38]
        pixel y x = Main.iterate (x / 40.0) (y/40.0-0.5)

main :: IO ()
main = putStrLn mandelbrot
---------------------------
Compile with options
-fviaa-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
This is certainly cleaner looking code but I cannot compile it. Looks
like -fviaa-C should be -fvia-C but then I get:

$ ghc -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
mandelbrot.hs -o mandelbrot

mandelbrot.hs:3:0:
Failed to load interface for `Control.Parallel.Strategies':
Use -v to see a list of the files searched for.

Haskell always seems to have awful library problems to me. I would have
hoped that installing the "parallel" library would solve this problem:

$ sudo apt-get install libghc6-parallel-dev
...
$

but now the compiler dies with an unintelligible internal error:

$ ghc -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
mandelbrot.hs -o mandelbrot
mandelbrot.o: In function `__stginit_Main_':
ghc29271_0.hc:(.text+0x78a): undefined reference to
`__stginit_parallelzm1zi0zi0zi0_ControlziParallelziStrategies_'
mandelbrot.o: In function `Main_lvl5_info':
ghc29271_0.hc:(.text+0x686): undefined reference to
`parallelzm1zi0zi0zi0_ControlziParallelziStrategies_parList_info'
collect2: ld returned 1 exit status

How should I proceed?
Post by Florian Kreidler
(*) Function parMap evaluates the list elements in parallel, applying
strategy rnf to each element. Strategy rnf evaluates its argument
completely.
The "invoke" function from my OCaml forks a parallel process and
communicates its result back in order to implement a future. That can be
used to parallelize some naive programs like this one in OCaml but there is
no substitute for mutable shared memory.
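The "invoke" function itself is not shown in this thread. As a rough
guide for readers following along, here is a minimal sketch of how such
a fork-based future can be built from Unix.fork and a pipe, assuming
the result is a type that Marshal can handle; error handling is
omitted, and this is illustrative rather than Jon's actual code:

let invoke (f : 'a -> 'b) (x : 'a) : unit -> 'b =
  let input, output = Unix.pipe () in
  match Unix.fork () with
  | 0 ->
      (* Child process: compute f x, marshal the result into the
         pipe and exit. *)
      Unix.close input;
      let oc = Unix.out_channel_of_descr output in
      Marshal.to_channel oc (f x) [];
      close_out oc;
      exit 0
  | pid ->
      (* Parent: return a thunk that blocks reading the child's
         result when forced. *)
      Unix.close output;
      let ic = Unix.in_channel_of_descr input in
      fun () ->
        let v = Marshal.from_channel ic in
        ignore (Unix.waitpid [] pid);
        close_in ic;
        v

Calling invoke f x forks the work immediately and returns a thunk;
applying the thunk blocks until the child's marshalled result arrives.
Compile with unix.cmxa, as described elsewhere in the thread.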

F# is the only functional language to have decent libraries for parallelism
AFAIK...
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-20 19:09:45 UTC
Permalink
Post by Jon Harrop
But it is standalone vanilla OCaml that compiles and runs out of the box on
most Unices including OS X.
I have installed the Debian packages for ocaml, but

$ ocamlopt mand.ml

complains that no implementation for module Unix is provided.
Post by Jon Harrop
This is certainly cleaner looking code but I cannot compile it. Looks
$ ghc -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
mandelbrot.hs -o mandelbrot
Use -v to see a list of the files searched for.
Haskell always seems to have awful library problems to me. I would have
$ sudo apt-get install libghc6-parallel-dev
With cabal, you can install it locally via

$ cabal install parallel

It will pull in all required dependencies automatically.
Post by Jon Harrop
$ ghc -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
mandelbrot.hs -o mandelbrot
ghc29271_0.hc:(.text+0x78a): undefined reference to
`__stginit_parallelzm1zi0zi0zi0_ControlziParallelziStrategies_'
ghc29271_0.hc:(.text+0x686): undefined reference to
`parallelzm1zi0zi0zi0_ControlziParallelziStrategies_parList_info'
collect2: ld returned 1 exit status
How should I proceed?
It does not link in the required libraries, because you have omitted
the command line switch "--make".
Post by Jon Harrop
Post by Florian Kreidler
(*) Function parMap evaluates the list elements in parallel, applying
strategy rnf to each element. Strategy rnf evaluates its argument
completely.
The "invoke" function from my OCaml forks a parallel process and
communicates its result back in order to implement a future. That can be
used to parallelize some naive programs like this one in OCaml but there is
no substitute for mutable shared memory.
That sounds very complicated. I don't want to care about mutable state when
I implement a simple parallel calculation. For multi-threaded programming,
Haskell also has synchronized shared state (MVar), but here that would be
overkill.
Post by Jon Harrop
F# is the only functional language to have decent libraries for parallelism
AFAIK...
Depends on your definition of "decent". Does it have something like
Haskell's par operator, so that a calculation can be split into two
parallel threads, like in

average xs = let s = sum xs
                 l = length xs
             in (s `par` l) `seq` s `div` l

?
William James
2009-02-21 01:18:23 UTC
Permalink
Post by Florian Kreidler
I have installed the Debian packages for ocaml, but
$ ocamlopt mand.ml
complains that no implementation for module Unix is provided.
ocamlopt unix.cmxa mand.ml
Florian Kreidler
2009-02-21 01:13:27 UTC
Permalink
Post by William James
Post by Florian Kreidler
I have installed the Debian packages for ocaml, but
$ ocamlopt mand.ml
complains that no implementation for module Unix is provided.
ocamlopt unix.cmxa mand.ml
Thank you. It takes 4.5 seconds on two cores, compared to 1.2
seconds for the sequential C++ version, 1.8 seconds for the
sequential Haskell version that uses unboxed arrays, and 1.2
seconds for the parallel Haskell version with unboxed arrays
on two cores.
Jon Harrop
2009-02-21 08:54:54 UTC
Permalink
Thank you. It takes 4.5 seconds on two cores...
Sounds fishy. The OCaml takes 0.34s here...
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
William James
2009-02-21 09:22:10 UTC
Permalink
Post by Jon Harrop
Thank you. It takes 4.5 seconds on two cores...
Sounds fishy. The OCaml takes 0.34s here...
On my 2 GHz laptop:

6.68 seconds
Jon Harrop
2009-02-21 10:04:21 UTC
Permalink
Post by William James
Post by Jon Harrop
Thank you. It takes 4.5 seconds on two cores...
Sounds fishy. The OCaml takes 0.34s here...
6.68 seconds
Are you running in a 32-bit OS?
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
William James
2009-02-21 19:38:49 UTC
Permalink
Post by Jon Harrop
Post by William James
Post by Jon Harrop
Thank you. It takes 4.5 seconds on two cores...
Sounds fishy. The OCaml takes 0.34s here...
6.68 seconds
Are you running in a 32-bit OS?
windows xp

--
Jon Harrop
2009-02-21 09:37:45 UTC
Permalink
Post by Florian Kreidler
Post by Jon Harrop
But it is standalone vanilla OCaml that compiles and runs out of the box
on most Unices including OS X.
I have installed the Debian packages for ocaml, but
$ ocamlopt mand.ml
complains that no implementation for module Unix is provided.
The Unix module is part of OCaml's standard library. Compile it with:

ocamlopt unix.cmxa mandelbrot.ml -o mandelbrot

OCaml's standard library also includes a regex implementation (Str module)
and even Tk bindings (for ocamlbrowser).
Post by Florian Kreidler
Post by Jon Harrop
$ sudo apt-get install libghc6-parallel-dev
With cabal, you can install it locally via
$ cabal install parallel
But Cabal itself is not available as a Debian package, so I have to install
Haskell's own proprietary package manager by hand:

$ wget
http://www.haskell.org/cabal/release/cabal-1.6.0.2/Cabal-1.6.0.2.tar.gz
...
$ gunzip <Cabal-1.6.0.2.tar.gz | tar xv
$ runhaskell Setup.hs configure
Configuring Cabal-1.6.0.2...
$ runhaskell Setup.hs build
Preprocessing library Cabal-1.6.0.2...
Building Cabal-1.6.0.2...
[ 1 of 51] Compiling Distribution.Simple.GHC.Makefile (
Distribution/Simple/GHC/Makefile.hs,
dist/build/Distribution/Simple/GHC/Makefile.o )
...
$ sudo runhaskell Setup.hs install
Installing library in /usr/local/lib/Cabal-1.6.0.2/ghc-6.8.2
...

And, err, I still don't have a "cabal" program...
Post by Florian Kreidler
It will pull in all required dependencies automatically.
Post by Jon Harrop
$ ghc -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
mandelbrot.hs -o mandelbrot
ghc29271_0.hc:(.text+0x78a): undefined reference to
`__stginit_parallelzm1zi0zi0zi0_ControlziParallelziStrategies_'
ghc29271_0.hc:(.text+0x686): undefined reference to
`parallelzm1zi0zi0zi0_ControlziParallelziStrategies_parList_info'
collect2: ld returned 1 exit status
How should I proceed?
It does not link in the required libraries, because you have omitted
the command line switch "--make".
That compiles:

$
ghc --make -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
mandelbrot.hs -o mandelbrot

But it now runs extremely slowly:

$ time ./mandelbrot +RTS -N16
...
real 3m44.084s
user 26m10.894s
sys 0m1.724s

That is 660x slower than OCaml.
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
(*) Function parMap evaluates the list elements in parallel, applying
strategy rnf to each element. Strategy rnf evaluates its argument
completely.
The "invoke" function from my OCaml forks a parallel process and
communicates its result back in order to implement a future. That can be
used to parallelize some naive programs like this one in OCaml but there
is no substitute for mutable shared memory.
That sounds very complicated. I don't want to care about mutable state
when I implement a simple parallel calculation...
Then you will not benefit from parallelism in most cases.
Post by Florian Kreidler
Post by Jon Harrop
F# is the only functional language to have decent libraries for
parallelism AFAIK...
Depends on your definition of "decent".
Concurrent GC and wait-free work-stealing queues.
Post by Florian Kreidler
Does it have something like Haskell's par operator, so that a calculation
can be split into two parallel threads, like in
average xs = let s = sum xs
                 l = length xs
             in (s `par` l) `seq` s `div` l
?
A good foundation is more important than syntax.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-21 12:30:52 UTC
Permalink
Post by Jon Harrop
But Cabal itself is not available as a Debian package, so I have to install
$ wget
http://www.haskell.org/cabal/release/cabal-1.6.0.2/Cabal-1.6.0.2.tar.gz
...
And, err, I still don't have a "cabal" program...
You didn't read www.haskell.org/cabal, did you? The command line interface
is in package cabal-install. It contains a shell script that downloads the
required dependencies and builds the cabal binary. Or you can download a
Windows executable there.
Post by Jon Harrop
$
ghc --make -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3 -threaded
mandelbrot.hs -o mandelbrot
$ time ./mandelbrot +RTS -N16
...
real 3m44.084s
user 26m10.894s
sys 0m1.724s
That is 660x slower than OCaml.
As I answered in the other posting: I can't reproduce that.
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
(*) Function parMap evaluates the list elements in parallel, applying
strategy rnf to each element. Strategy rnf evaluates its argument
completely.
The "invoke" function from my OCaml forks a parallel process and
communicates its result back in order to implement a future. That can be
used to parallelize some naive programs like this one in OCaml but there
is no substitute for mutable shared memory.
That sounds very complicated. I don't want to care about mutable state
when I implement a simple parallel calculation...
Then you will not benefit from parallelism in most cases.
That's why I wrote in the next sentence: "For multi-threaded programming,
Haskell also has synchronized shared state (MVar), but here that would be
overkill."
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
F# is the only functional language to have decent libraries for
parallelism AFAIK...
Depends on your definition of "decent".
Concurrent GC and wait-free work-stealing queues.
Post by Florian Kreidler
Does it have something like Haskell's par operator, so that a calculation
can be split into two parallel threads, like in
average xs = let s = sum xs
                 l = length xs
             in (s `par` l) `seq` s `div` l
?
A good foundation is more important than syntax.
Sounds like "no". :)
Jon Harrop
2009-02-21 15:10:44 UTC
Permalink
Post by Florian Kreidler
Post by Jon Harrop
But Cabal itself is not available as a Debian package, so I have to
$ wget
http://www.haskell.org/cabal/release/cabal-1.6.0.2/Cabal-1.6.0.2.tar.gz
...
And, err, I still don't have a "cabal" program...
You didn't read www.haskell.org/cabal, did you? The command line interface
is in package cabal-install. It contains a shell script that downloads the
required dependencies and builds the cabal binary.
The script breaks:

$ ./bootstrap.sh
Checking installed packages for ghc-6.8.2...

The Haskell package 'parsec' is required but it is not installed.
If you are using a ghc package provided by your operating system
then install the corresponding packages for 'parsec' and 'network'.
If you built ghc from source with only the core libraries then you
should install these extra packages. You can get them from hackage.

Error during cabal-install bootstrap:
The Haskell package 'parsec' is required but it is not installed.

Turns out the two missing libraries are available via apt:

$ sudo apt-get install libghc6-parsec-dev
...
$ sudo apt-get install libghc6-network-dev
...

Now the bootstrap script from cabal-install at least runs but it installs
the executable in an undesirable place. I'll just install it by hand
myself...
Post by Florian Kreidler
Or you can download a windows executable there.
I'm running Linux.
Post by Florian Kreidler
Post by Jon Harrop
$
ghc --make -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3
-threaded mandelbrot.hs -o mandelbrot
$ time ./mandelbrot +RTS -N16
...
real 3m44.084s
user 26m10.894s
sys 0m1.724s
That is 660x slower than OCaml.
As I answered in the other posting: I can't reproduce that.
Ok.
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
(*) Function parMap evaluates the list elements in parallel, applying
strategy rnf to each element. Strategy rnf evaluates its argument
completely.
The "invoke" function from my OCaml forks a parallel process and
communicates its result back in order to implement a future. That can
be used to parallelize some naive programs like this one in OCaml but
there is no substitute for mutable shared memory.
That sounds very complicated. I don't want to care about mutable state
when I implement a simple parallel calculation...
Then you will not benefit from parallelism in most cases.
That's why I wrote in the next sentence: "For multi-threaded programming,
Haskell also has synchronized shared state (MVar), but here that would be
overkill."
I assume MVar is also slow?
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
F# is the only functional language to have decent libraries for
parallelism AFAIK...
Depends on your definition of "decent".
Concurrent GC and wait-free work-stealing queues.
Sounds like "no". :)
Indeed.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-21 14:35:09 UTC
Permalink
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
F# is the only functional language to have decent libraries for
parallelism AFAIK...
Depends on your definition of "decent".
Concurrent GC and wait-free work-stealing queues.
Sounds like "no". :)
Indeed.
Why do you misquote me?
Jon Harrop
2009-02-21 17:19:17 UTC
Permalink
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
F# is the only functional language to have decent libraries for
parallelism AFAIK...
Depends on your definition of "decent".
Concurrent GC and wait-free work-stealing queues.
Sounds like "no". :)
Indeed.
Why do you misquote me?
I just stripped out the piffle. Why did you evade the question to focus on
unimportant syntactic issues?
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-21 16:31:30 UTC
Permalink
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Concurrent GC and wait-free work-stealing queues.
Sounds like "no". :)
Indeed.
Why do you misquote me?
I just stripped out the piffle. Why did you evade the question to focus on
unimportant syntactic issues?
You silently removed the part my answer "Sounds like 'no'. :)" referred to.

Besides that, I did not talk about syntactic issues. I talked about
Haskell's combinator par, which has no semantic equivalent in any strict
programming language.
Jon Harrop
2009-02-21 18:12:43 UTC
Permalink
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
Concurrent GC and wait-free work-stealing queues.
Sounds like "no". :)
Indeed.
Why do you misquote me?
I just stripped out the piffle. Why did you evade the question to focus
on unimportant syntactic issues?
You silently removed the part my answer "Sounds like 'no'. :)" referred to.
I assumed you were replying to everything you had quoted.

Regardless, the fact remains that "Haskell's state-of-the-art libraries" are
not very good by design and, in particular, are many years behind what
people are already using on mainstream platforms in industry in terms of
the problems they solve.
Post by Florian Kreidler
Besides that, I did not talk about syntactic issues. I talked about
Haskell's combinator par, which has no semantic equivalent in any strict
programming language.
But there are functional equivalents that not only solve the same problem
but solve it significantly more efficiently.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
r***@ps.uni-sb.de
2009-02-21 18:28:24 UTC
Permalink
Post by Florian Kreidler
Haskell's combinator par, which has no semantic equivalent in any strict
programming language.
I don't want to comment on the latest installment of The Grand Micro
Benchmark Wars on CLF itself. But regarding your statement above, one
such "semantic equivalent" obviously would be futures. For example, in
Alice ML -- an ML that has futures -- you could write:

fun average xs = (spawn sum xs) div (spawn length xs)

(Mind you, the Alice runtime isn't able to utilize multiple cores,
and is generally slow, but that's separate from semantics.)
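The same shape can be written with a fork-based OCaml future like the
"invoke" sketched earlier, purely as an illustration (with one Unix
process per spawn, the fork and marshalling overhead would of course
dwarf a computation this small):

let average xs =
  let s = invoke (List.fold_left (+) 0) xs in  (* future for the sum *)
  let l = invoke List.length xs in             (* future for the length *)
  s () / l ()                                  (* force both, then divide *)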
Jon Harrop
2009-02-21 18:42:17 UTC
Permalink
Post by r***@ps.uni-sb.de
Post by Florian Kreidler
Haskell's combinator par, which has no semantic equivalent in any strict
programming language.
I don't want to comment on the latest installment of The Grand Micro
Benchmark Wars on CLF itself. But regarding your statement above, one
such "semantic equivalent" obviously would be futures. For example, in
fun average xs = (spawn sum xs) div (spawn length xs)
(Mind you, the Alice runtime isn't able to utilize multiple core's,
and is generally slow, but that's separate from semantics.)
Exactly. Futures are also available in both the F# and Scala standard
libraries, even with call-by-need in Scala, and both are built upon more
scalable foundations than GHC's.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
r***@ps.uni-sb.de
2009-02-22 10:32:03 UTC
Permalink
Post by Jon Harrop
Futures are also available in both the F# and Scala standard
libraries, even with call-by-need in Scala, and both are built upon more
scalable foundations than GHC's.
Well, except that these aren't really futures in the Baker-Hewitt-
Halstead-Flanagan-Felleisen sense, because they are not transparent
and thus do not support dataflow synchronisation. It is impossible to
add that as a library because it affects evaluation of all primitives.
Duncan Coutts
2009-02-23 17:00:42 UTC
Permalink
Post by Jon Harrop
Post by Jon Harrop
But Cabal itself is not available as a Debian package, so I have to
$ wget
http://www.haskell.org/cabal/release/cabal-1.6.0.2/Cabal-1.6.0.2.tar.gz
...
And, err, I still don't have a "cabal" program...
Sorry about that. In hindsight, of course, we would have named them
differently. But you know how it is: the first package gets the good
name, and then the others have to be named differently.

In the next major overhaul we'll re-arrange the package names so that
the 'cabal' command line tool is in the 'cabal' package. We'll stick
the libs in various cabal-* packages.
Post by Jon Harrop
You didn't read www.haskell.org/cabal, did you? The command line interface
is in package cabal-install. It contains a shell script that downloads the
required dependencies and builds the cabal binary.
$ ./bootstrap.sh
Checking installed packages for ghc-6.8.2...
The Haskell package 'parsec' is required but it is not installed.
If you are using a ghc package provided by your operating system
then install the corresponding packages for 'parsec' and 'network'.
If you built ghc from source with only the core libraries then you
should install these extra packages. You can get them from hackage.
The Haskell package 'parsec' is required but it is not installed.
$ sudo apt-get install libghc6-parsec-dev
...
$ sudo apt-get install libghc6-network-dev
...
Hmm, I'm not sure what more we can do in that circumstance. Do you
want it to work out that you're running Debian and tell you the
packages that you need to install? Obviously we want the Debian people
to package the thing so you don't have to run any scripts. Do you have
any other suggestions for what we can do in the meantime?
Post by Jon Harrop
Now the bootstrap script from cabal-install at least runs but it installs
the executable in an undesirable place. I'll just install it by hand
myself...
Ah now this is a controversial point. We've been having difficulty
agreeing on where it should install things by default. Some people
will scream if we plonk things in $HOME/bin. Obviously the current
place is sub-optimal. Suggestions most welcome. See also the ticket on
this:
http://hackage.haskell.org/trac/hackage/ticket/289

Thanks for the feedback.

Duncan
Jon Harrop
2009-02-23 18:34:56 UTC
Permalink
Post by Duncan Coutts
Post by Jon Harrop
Post by Jon Harrop
But Cabal itself is not available as a Debian package, so I have to
$ wget
http://www.haskell.org/cabal/release/cabal-1.6.0.2/Cabal-1.6.0.2.tar.gz
...
And, err, I still don't have a "cabal" program...
Sorry about that. In hindsight of course we would have named them
differently. But you know how it is, the first package gets called the
good name and then the others have to be named differently.
In the next major overhaul we'll re-arrange the package names so that
the 'cabal' command line tool is in the 'cabal' package. We'll stick
the libs in various cabal-* packages.
If you just make a deb package most Linux users will be able to do:

apt-get install cabal

and then start using that proprietary installer. Incidentally, cabal seems
to work nicely once it is installed but that installation procedure will be
putting lots of people off.
Post by Duncan Coutts
Post by Jon Harrop
The Haskell package 'parsec' is required but it is not installed.
$ sudo apt-get install libghc6-parsec-dev
...
$ sudo apt-get install libghc6-network-dev
...
Hmm, I'm not sure what more we can do in that circumstance. Do you
want it to work out that you're running debian and tell you the
packages that you need to install?
No. Like millions of other people, I already have an excellent package
manager called apt that offers over a hundred thousand packages, including
many different programming languages, and has GUI tools making it easier to
use. I have a lot of faith in apt because it is mature: I have been using
it myself for ten years and have never had a problem.

Shipping software on Linux without a deb package for apt has bad
connotations for me. I instinctively avoid such software because it tends
to be badly constructed and will be a pain to uninstall. So, like most
Linux users, I will rarely consider installing such software and will
almost certainly avoid it.

So I just want a plain old deb package for apt. No tarballs, no bootstrap
scripts wgetting stuff, just good old deb packages.

Incidentally, how do I uninstall cabal?
Post by Duncan Coutts
Obviously we want the debian people to package the thing so you don't have
to run any scripts.
If this is yours then you should package it properly yourself. I would not
recommend relying upon others to do it for you. Indeed, they have not...
Post by Duncan Coutts
Do you have any other suggestions for what we can do in the mean time?
Make the transition to using your proprietary package manager as painless
and safe as possible by shipping cabal as a deb package for apt. Make sure
your cabal deb package has the correct dependencies on existing Debian
packages (looks like ghc6, libghc6-parsec-dev and libghc6-network-dev). The
other Debian-based distros like Ubuntu will simply copy your package
verbatim and your software will suddenly be much easier to get started with
for millions of Linux users.
Post by Duncan Coutts
Post by Jon Harrop
Now the bootstrap script from cabal-install at least runs but it installs
the executable in an undesirable place. I'll just install it by hand
myself...
Ah now this is a controversial point. We've been having difficulty
agreeing on where it should install things by default. Some people
will scream if we plonk things in $HOME/bin. Obviously the current
place is sub-optimal. Suggestions most welcome.
Submit a package for apt that installs cabal in /usr/bin.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Duncan Coutts
2009-02-23 20:07:32 UTC
Permalink
Post by Jon Harrop
Post by Duncan Coutts
Post by Jon Harrop
Post by Jon Harrop
But Cabal itself is not available as a Debian package, so I have to
$ wget
http://www.haskell.org/cabal/release/cabal-1.6.0.2/Cabal-1.6.0.2.tar.gz
...
And, err, I still don't have a "cabal" program...
Sorry about that. In hindsight of course we would have named them
differently. But you know how it is, the first package gets called the
good name and then the others have to be named differently.
In the next major overhaul we'll re-arrange the package names so that
the 'cabal' command line tool is in the 'cabal' package. We'll stick
the libs in various cabal-* packages.
If you just make a deb package most Linux users will be able to do:
apt-get install cabal
We seem to have been having a problem with the "just" bit of that:
some kind of organisational blockage when it comes to getting packages
into Debian. Everyone wants them there, but nobody quite knows why
they are not, or who is responsible. Hopefully we can connect the
right people together and get things moving.
Post by Jon Harrop
and then start using that proprietary installer. Incidentally, cabal seems
to work nicely once it is installed
Great.
Post by Jon Harrop
but that installation procedure will be putting lots of people off.
Yeah. Hopefully once it's part of the platform release it'll be better
because distros will not be able to get away without packaging that
and still claim to support Haskell.
Post by Jon Harrop
Post by Duncan Coutts
Post by Jon Harrop
The Haskell package 'parsec' is required but it is not installed.
$ sudo apt-get install libghc6-parsec-dev
...
$ sudo apt-get install libghc6-network-dev
...
Hmm, I'm not sure what more we can do in that circumstance. Do you
want it to work out that you're running debian and tell you the
packages that you need to install?
No. Like millions of other people, I already have an excellent package
manager called apt that offers over a hundred thousand packages, including
many different programming languages, and has GUI tools making it easier to
use. I have a lot of faith in apt because it is mature: I have been using
it myself for ten years and have never had a problem.
Don't think that we're ignoring the benefits of native system package
managers. On the contrary, the Cabal packaging format has been designed
(partly by people with experience of Linux distro package management)
so that it is possible to translate it into native packages. We've got
the tools to do it; it's really just the mechanics and organisation of
getting it done that's holding us back here. People have managed to
automate this for Gentoo, Arch and Fedora. It's really just Debian
where things are not progressing so quickly. Unfortunately, of course,
that covers a huge proportion of the actual and potential user base.
Very frustrating.
Post by Jon Harrop
Shipping software on Linux without a deb package for apt has bad
connotations for me. I instinctively avoid such software because it tends
to be badly constructed and will be a pain to uninstall. So, like most
Linux users, I will rarely consider installing such software and will
almost certainly avoid it.
So I just want a plain old deb package for apt. No tarballs, no bootstrap
scripts wgetting stuff, just good old deb packages.
That's certainly the goal. If only we could make the wheels turn a
little quicker we'd all be happy.
Post by Jon Harrop
Incidentally, how do I uninstall cabal?
rm -rf ~/.cabal
and
ghc-pkg unregister --user <pkg>
for any packages that you installed.

It's not a proper package manager yet: it does not track installed
files. However, the plan is not really that end users use this
secondary pseudo package manager anyway. If all were running smoothly
with the Debian packaging then all these things would already be
packaged and end users would just apt-get everything. Only
bleeding-edge hackers would be using cabal to get packages from
Hackage that were so fresh that they were not in Debian yet.
Post by Jon Harrop
Post by Duncan Coutts
Obviously we want the debian people to package the thing so you don't have
to run any scripts.
If this is yours then you should package it properly yourself. I would not
recommend relying upon others to do it for you. Indeed, they have not...
You make it sound so easy :-) I'm not a Debian developer; I have no
special power within Debian. How are the rest of us supposed to get
our software included?
Post by Jon Harrop
Post by Duncan Coutts
Do you have any other suggestions for what we can do in the mean time?
Make the transition to using your proprietary package manager as painless
and safe as possible by shipping cabal as a deb package for apt. Make sure
your cabal deb package has the correct dependencies on existing Debian
packages (looks like ghc6, libghc6-parsec-dev and libghc6-network-dev). The
other Debian-based distros like Ubuntu will simply copy your package
verbatim and your software will suddenly be much easier to get started with
for millions of Linux users.
You mean we make debs and put them on our own website, or get our
packages included into the Debian collection proper? Of course really
we want the latter, as otherwise they do not get updated and kept in
sync with the other packages. Maybe it's a better intermediate
solution than just providing a source tarball, but it cannot be our
longer-term objective.
Post by Jon Harrop
Post by Duncan Coutts
Ah now this is a controversial point. We've been having difficulty
agreeing on where it should install things by default. Some people
will scream if we plonk things in $HOME/bin. Obviously the current
place is sub-optimal. Suggestions most welcome.
Submit a package for apt that installs cabal in /usr/bin.
Well that solves where cabal itself is installed but we have the same
problem for where cabal should install things that the user requests.
Remember this is (by default) a per-user package manager. So when you
do:

$ cabal install xmonad

then you would expect to be able to run:

$ xmonad

which means we have to have installed it (or symlinked it) somewhere
on your $PATH, or you've had to adjust your $PATH. The latter is of
course confusing for new users and the former can tick off experienced
users. Generally the suggestions run along the lines of: 1) do no
damage, 2) do something sensible, 3) tell people what you've done, and
4) tell them how to change it.

Duncan
Manlio Perillo
2009-02-21 20:52:44 UTC
Permalink
Post by Jon Harrop
[...]
Post by Florian Kreidler
It does not link in the required libraries, because you have omitted
the command line switch "--make".
$
ghc --make -fvia-C -O2 -fexcess-precision -optc-ffast-math -optc-O3
-threaded mandelbrot.hs -o mandelbrot
$ time ./mandelbrot +RTS -N16
...
real 3m44.084s
user 26m10.894s
sys 0m1.724s
That is 660x slower than OCaml.
I suspect that you are using too many threads.
Set -N8 if you are on an 8-core machine.
Post by Jon Harrop
[...]
Manlio Perillo
Jon Harrop
2009-02-22 18:17:28 UTC
Permalink
Post by Manlio Perillo
Post by Jon Harrop
That is 660x slower than OCaml.
I suspect that you are using too many threads.
Set -N8, if you are on an 8 cores machine.
Better but still two orders of magnitude slower than the OCaml:

$ time ./mandelbrot +RTS -N8
real 0m32.623s
user 2m59.399s
sys 0m0.916s

The inexplicably awful performance of that Haskell forced Florian to do a
complete U-turn from:

"I don't want to care about mutable state when I implement a simple
parallel calculation..."

to:

"Here is another version that uses unboxed arrays from package uvector..."

I "kick the tires" of Haskell from time to time but I always end up back at
the same conclusion: these advanced optimizations and libraries are just
toys that cannot be used to solve real problems with competitive
efficiency.

Unfortunately, Haskell is now as good as it gets when it comes to standalone
FPL implementations. If you have real problems to solve, your only viable
options today are JVM- or .NET-based languages. I think that is a great
shame but the complexity of implementing a production-quality FPL from
scratch has just become too great for the academics who implement them.

On the bright side, at least it looks as though the OpenJDK project will get
real tail calls and new FPL implementations can build upon that in order to
inherit a decent foundation. Let's hope something becomes of it...
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-23 13:24:43 UTC
Permalink
Post by Jon Harrop
Post by Manlio Perillo
Post by Jon Harrop
That is 660x slower than OCaml.
I suspect that you are using too many threads.
Set -N8, if you are on an 8 cores machine.
$ time ./mandelbrot +RTS -N8
real 0m32.623s
user 2m59.399s
sys 0m0.916s
The inexplicably awful performance of that Haskell forced Florian to do a
"I don't want to care about mutable state when I implement a simple
parallel calculation..."
"Here is another version that uses unboxed arrays from package uvector..."
This is getting ridiculous.

1) No one else was able to reproduce your claim that the Haskell program
is two orders of magnitude slower than your OCaml program. Thus, the
"inexplicably awful performance of that Haskell" did not force me to
do anything.

2) You are confusing mutable state with strict evaluation. I wouldn't
mind if you did not draw conclusions about me from that.

Goodbye.
Manlio Perillo
2009-02-24 12:00:16 UTC
Permalink
Post by Florian Kreidler
[...]
This is getting ridiculous.
1) No one else was able to reproduce your claim that the Haskell program
is two orders of magnitude slower than your OCaml program. Thus, the
"inexplicably awful performance of that Haskell" did not force me to
do anything.
I suspect the reason is heavy process usage, which works better on an
8-core machine.
Post by Florian Kreidler
[...]
Manlio Perillo
parnell
2009-02-23 18:04:30 UTC
Permalink
Post by Jon Harrop
I "kick the tires" of Haskell from time to time but I always end up back at
the same conclusion: these advanced optimizations and libraries are just
toys that cannot be used to solve real problems with competitive
efficiency.
Unfortunately, Haskell is now as good as it gets when it comes to standalone
FPL implementations. If you have real problems to solve, your only viable
options today are JVM- or .NET-based languages. I think that is a great
shame but the complexity of implementing a production-quality FPL from
scratch has just become too great for the academics who implement them.
On the bright side, at least it looks as though the OpenJDK project will get
real tail calls and new FPL implementations can build upon that in order to
inherit a decent foundation. Let's hope something becomes of it...
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd. http://www.ffconsultancy.com/?u
Why must you spew forth FUD? It only sullies any legitimate issues you
may have found.

Let me remind you of your own findings:

$ time ./mandelbrot +RTS -N7
real 0m0.551s
user 0m2.348s
sys 0m0.008s

Using 7 worker threads, this takes 0.551 seconds.
So, it is 1.6x slower than your best OCaml version (I might add that
no one else was able to reproduce your results running your OCaml
code).

Florian's version was written in a straightforward functional style
and, I would argue, demonstrates that Haskell's "advanced
optimizations and libraries" perform very well and allow one to
program in a high-level functional style without giving up
performance.

Parnell
Jon Harrop
2009-02-24 14:41:24 UTC
Permalink
Post by Jon Harrop
$ time ./mandelbrot +RTS -N7
real 0m0.551s
user 0m2.348s
sys 0m0.008s
Using 7 worker threads, this takes 0.551 seconds.
So, it is 1.6x slower than your best OCaml version (I might add that
no one else was able to reproduce your results running your OCaml
code).
Indeed, Florian only ran his parallel version on 2 cores before concluding
that GHC's support for scalable parallelism was superb. He was wrong. GHC's
support for parallelism hinges on a non-concurrent GC that scales poorly
beyond only 4 cores. I have 8 cores today and we'll have 16 cores on the
desktop by the end of this year.
Post by Jon Harrop
Florian's version was written in a straightforward functional style and, I
would argue, demonstrates that Haskell's "advanced optimizations and
libraries" perform very well and allow one to program in a high-level
functional style without giving up performance.
Total and utter nonsense for two reasons:

1. You have concluded that Haskell performs "very well" despite the fact
that I have proven that it not only performs badly on one core but it does
not even scale to eight cores. Moreover, you need to actually compare it
with fast code, e.g. written in C. In reality, bog standard C code compiled
with GCC (not a good compiler) and without using advanced parallelism (e.g.
Cilk) is 2x faster on one core and over 4x faster on eight cores than
Florian's fastest Haskell and over 1,000x faster than his idiomatic
parallel Haskell. The C code scales a lot better because it does not
introduce unnecessary global synchronization as GHC's GC does and even the
C code leaves a lot of room for optimization (e.g. wait-free work-stealing
queues perform load balancing much more efficiently).

2. You have neglected all of the low-level optimizations that Florian did in
his ugly hack: manual inlining of the original "loop" function, manually
replacing the outer "if i=0 then '*' else ' '" with a different
"loop" function that is hardcoded to return a char instead of the number of
iterations, manual unboxing of complex numbers into separate floating point
values, manual common subexpression elimination of zr2, zi2 and temp, use
of Don Stewart's experimental uvector library to evade the crippling
performance of idiomatic lazy code with immutable data structures.

The triumph you're claiming for Haskell is pure fantasy.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
parnell
2009-02-24 16:05:12 UTC
Permalink
Post by Jon Harrop
Post by Jon Harrop
$ time ./mandelbrot +RTS -N7
real    0m0.551s
user    0m2.348s
sys     0m0.008s
Using 7 worker threads, this takes 0.551 seconds.
So, it is 1.6x slower than your best OCaml version (I might add that
no one else was able to reproduce your results running your OCaml
code).
Indeed, Florian only ran his parallel version on 2 cores before concluding
that GHC's support for scalable parallelism was superb. He was wrong. GHC's
support for parallelism hinges on a non-concurrent GC that scales poorly
beyond only 4 cores. I have 8 cores today and we'll have 16 cores on the
desktop by the end of this year.
Post by Jon Harrop
Florian's version was written in a straightforward functional style and, I
would argue, demonstrates that Haskell's "advanced optimizations and
libraries" perform very well and allow one to program in a high-level
functional style without giving up performance.
1. You have concluded that Haskell performs "very well" despite the fact
that I have proven that it not only performs badly on one core but it does
not even scale to eight cores. Moreover, you need to actually compare it
with fast code, e.g. written in C. In reality, bog standard C code compiled
with GCC (not a good compiler) and without using advanced parallelism (e.g.
Cilk) is 2x faster on one core and over 4x faster on eight cores than
Florian's fastest Haskell and over 1,000x faster than his idiomatic
parallel Haskell.
Wait, I thought we were comparing OCaml, F#, Clojure, and
Pascal? ;)


The last C++ timing I saw mentioned in this thread was:
"/time Mandelbrot_cpp:
2.544u 0.004s 0:02.57 98.8% 0+0k 0+0io 0pf+0w "

Which Florian's version beats on one core:

$ time ./mandelbrot +RTS -N1
real 0m2.022s
user 0m2.016s
sys 0m0.000s

The Haskell version continues to improve performance, with the
10-thread version getting the best time. How is this "not even
scal[ing] to eight cores"?
$ time ./mandelbrot +RTS -N2
real 0m1.749s
user 0m3.024s
sys 0m0.000s
$ time ./mandelbrot +RTS -N3
real 0m1.031s
user 0m2.124s
sys 0m0.016s
$ time ./mandelbrot +RTS -N4
real 0m0.942s
user 0m2.844s
sys 0m0.012s
$ time ./mandelbrot +RTS -N5
real 0m0.723s
user 0m2.384s
sys 0m0.052s
$ time ./mandelbrot +RTS -N6
real 0m0.593s
user 0m2.184s
sys 0m0.088s
$ time ./mandelbrot +RTS -N7
real 0m0.551s
user 0m2.348s
sys 0m0.008s
$ time ./mandelbrot +RTS -N8
real 0m0.713s
user 0m2.864s
sys 0m0.072s
$ time ./mandelbrot +RTS -N9
real 0m0.673s
user 0m2.740s
sys 0m0.128s
$ time ./mandelbrot +RTS -N10
real 0m0.496s
user 0m2.320s
sys 0m0.064s
$ time ./mandelbrot +RTS -N11
real 0m0.503s
user 0m2.260s
sys 0m0.076s
$ time ./mandelbrot +RTS -N12
real 0m0.607s
user 0m2.176s
sys 0m0.052s
Post by Jon Harrop
The C code scales a lot better because it does not
introduce unnecessary global synchronization as GHC's GC does and even the
C code leaves a lot of room for optimization (e.g. wait-free work-stealing
queues perform load balancing much more efficiently).
You are using version 6.8.3 of the GHC compiler, which does not have
the parallel garbage collector that is included with the 6.10
version. Both concurrent and parallel garbage collection schemes stop
the world for some period of time, so your argument applies to all
functional languages as well as Java and .NET.

From my own experience with other programs running in a multi-core
setting I have found that GHC's parallel GC has minimized the time
spent garbage collecting. That said, there are times when a concurrent
GC would perform better.
Post by Jon Harrop
2. You have neglected all of the low-level optimizations that Florian did in
his ugly hack: manual inlining of the original "loop" function, manually
replacing the outer "if i=0 then '*' else ' '" with a different
"loop" function that is hardcoded to return a char instead of the number of
iterations, manual unboxing of complex numbers into separate floating point
Beauty is in the eye of the beholder; I find Florian's code more
"idiomatic" and readable than the original (no offense to Michael).
Post by Jon Harrop
values, manual common subexpression elimination of zr2, zi2 and temp, use
of Don Stewart's experimental uvector library to evade the crippling
performance of idiomatic lazy code with immutable data structures.
All of Haskell is experimental, an improving and maturing technology.
I have found that the uvector interface is quite straightforward to
use, and highly reliable.
Post by Jon Harrop
The triumph you're claiming for Haskell is pure fantasy.
When are you going to finish your LLVM so we can stop arguing about
OCaml, Haskell, etc ... ;)
Post by Jon Harrop
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd. http://www.ffconsultancy.com/?u
Duncan Coutts
2009-02-24 20:45:15 UTC
Permalink
Post by parnell
Post by Jon Harrop
The C code scales a lot better because it does not
introduce unnecessary global synchronization as GHC's GC does and even the
C code leaves a lot of room for optimization (e.g. wait-free work-stealing
queues perform load balancing much more efficiently).
You are using version 6.8.3 of the GHC compiler which does not have
the parallel garbage collector that is included with the 6.10
version. Both concurrent and parallel garbage collection schemes stop
the world for some period of time, so your argument applies to all
functional languages as well as Java and .NET.
From my own experience with other programs running in a multi-core
setting I have found that GHC's parallel GC has minimized the time
spent garbage collecting. That said there are times that a concurrent
GC would perform better.
Just a couple points of clarification. GHC 6.10 does indeed have a
parallel collector, though the version in 6.10 does not get great
speedups yet. There have been a number of improvements in ghc HEAD
which make a considerable performance difference in several cases.
http://ghcmutterings.wordpress.com/2009/01/09/46/

A concurrent collector (as the term is usually understood) means the
collector runs concurrently with the mutator. This gives zero pause
times. As I understand it, and from what Jon said, the current .NET
and Java collectors are parallel but not concurrent. They do have
short pause times. So they are in the same class as the current GHC
collector.

A concurrent collector is usually slower overall because more care
needs to be taken so that the mutator and collector do not step on
each others toes. There are cases of course where zero pause time is
more important than overall throughput.

What the current GHC collector does not yet do, and which the .NET and
best JVM collectors probably do do, is to collect the youngest
generation per-cpu without all cpus having to collect at the same
time. The plan is to implement this feature in GHC's collector and it
is expected to improve scalability for GC-heavy programs.

Btw, on the topic of scalability, we've got our first scalability
benchmarks from the Haskell/OpenSPARC project. On trivially
parallelisable benchmarks we're getting near linear scaling up to 64
threads (the total number of hardware threads available).
http://ghcsparc.blogspot.com/2009/02/bugfixes-and-pretty-graphs.html
Note that the graphs are log scale on both axes. While it looks
promising it's early days still of course. There's a lot more to do.

Duncan
Manlio Perillo
2009-02-24 22:05:35 UTC
Permalink
Post by Duncan Coutts
[...]
A concurrent collector (as the term is usually understood) means the
collector runs concurrently with the mutator. This gives zero pause
times. As I understand it, and from what Jon said, the current .NET and
Java collectors are parallel but not concurrent. They do have short
pause times. So they are in the same class as the current GHC collector.
After a quick search:
http://vineetgupta.spaces.live.com/blog/cns!8DE4BDC896BEE1AD!1104.entry
Post by Duncan Coutts
[....]
Manlio Perillo
Jon Harrop
2009-02-25 14:50:44 UTC
Permalink
Post by Duncan Coutts
Just a couple points of clarification. GHC 6.10 does indeed have a
parallel collector, though the version in 6.10 does not get great
speedups yet. There have been a number of improvements in ghc HEAD
which make a considerable performance difference in several cases.
http://ghcmutterings.wordpress.com/2009/01/09/46/
Great.
Post by Duncan Coutts
A concurrent collector (as the term is usually understood) means the
collector runs concurrently with the mutator.
Yes. The classification is split into mostly and fully concurrent where
mostly concurrent collectors only run concurrently some of the time with
some of the threads and perform global synchronization elsewhere such as
before the mark phase.

Note that some people (Boehm) have used the same nomenclature to mean
completely different things.
Post by Duncan Coutts
This gives zero pause times.
Zero pause times cannot be achieved. A fully concurrent collector can reduce
pause times to only those cases when contention requires it and that is
claimed to be excellent for soft real-time applications but most concurrent
collectors are only mostly concurrent and they still incur significant
pauses.
Post by Duncan Coutts
As I understand it, and from what Jon said, the current .NET
and Java collectors are parallel but not concurrent.
No. They are mostly concurrent.
Post by Duncan Coutts
They do have short pause times.
No. They have long pause times for threads that exceed their allocation
quota because they are suspended while the GC takes place but other threads
continue to run concurrently with the collection and see only small pause
times. By long pause times I mean anything up to about 4 seconds.
Post by Duncan Coutts
So they are in the same class as the current GHC collector.
I do not believe so. AFAIK the current GHC collector is only parallel and
suspends all threads whenever any thread invokes a GC (making it impossible
to obtain small pause times by using separate threads) and for the entire
duration of the GC (killing scalability).
Post by Duncan Coutts
A concurrent collector is usually slower overall because more care
needs to be taken so that the mutator and collector do not step on
each other's toes.
Fully concurrent collectors are generally believed to be slow but some of
the latest research claims that they have been implemented in JVMs with
competitive performance. Mostly concurrent collectors are certainly not
slow: both Sun's Hotspot JVM and Microsoft's CLR run circles around GHC.
Post by Duncan Coutts
There are cases of course where zero pause time is more important than
overall throughput.
Due to performance considerations, industrial-strength implementations
currently take the trade-off offered by a mostly concurrent GC with the
assumption that low-allocating UI threads will be able to run concurrently
with the collection and, consequently, you get minimal pause times exactly
where you want them.

IME that works well on the CLR where our F# for Visualization software uses
worker threads to perform CPU intensive computations (including running the
F# interaction session itself) but the UI remains responsive because it is
run on a separate UI thread. We have never had a problem with stalling due
to garbage collection even though the CLR's GC is only mostly concurrent.
Post by Duncan Coutts
What the current GHC collector does not yet do, and which the .NET and
best JVM collectors probably do do, is to collect the youngest
generation per-cpu without all cpus having to collect at the same
time.
That is really awful. Moreover, I cannot imagine why they would not have
per-thread nurseries when they are much easier to implement than the
functionality they already built?!

The JVM and CLR certainly do not do that, no.
Post by Duncan Coutts
The plan is to implement this feature in GHC's collector and it
is expected to improve scalability for GC-heavy programs.
I expect that will make a huge difference. That is a really bad design for a
language like Haskell where those collections will occur very often.
Post by Duncan Coutts
Btw, on the topic of scalability, we've got our first scalability
benchmarks from the Haskell/OpenSPARC project. On trivially
parallelisable benchmarks we're getting near linear scaling up to 64
threads (the total number of hardware threads available).
http://ghcsparc.blogspot.com/2009/02/bugfixes-and-pretty-graphs.html
Note that the graphs are log scale on both axes. While it looks
promising it's early days still of course. There's a lot more to do.
Is that a performance degradation from 32 to 64 cores on an embarrassingly
parallel problem? Why would that occur?
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Duncan Coutts
2009-02-25 17:19:12 UTC
Permalink
Post by Jon Harrop
Post by Duncan Coutts
As I understand it, and from what Jon said, the current .NET
and Java collectors are parallel but not concurrent.
No. They are mostly concurrent.
Post by Duncan Coutts
They do have short pause times.
No. They have long pause times for threads that exceed their allocation
quota, because those threads are suspended while the GC takes place;
other threads continue to run concurrently with the collection and see
only small pause times. By long pause times I mean anything up to about
4 seconds.
That's interesting and as you say it sounds like a good trade-off for
UI vs computational threads. I expect this would be trickier to
manage for runtime systems that use lightweight threads, since all
lightweight threads on one OS thread would share the same young
generation.
Post by Jon Harrop
Post by Duncan Coutts
A concurrent collector is usually slower overall because more care
needs to be taken so that the mutator and collector do not step on
each other's toes.
Fully concurrent collectors are generally believed to be slow but some of
the latest research claims that they have been implemented in JVMs with
competitive performance. Mostly concurrent collectors are certainly not
slow: both Sun's Hotspot JVM and Microsoft's CLR run circles around GHC.
I look forward to seeing the benchmarks :-)
Post by Jon Harrop
Post by Duncan Coutts
What the current GHC collector does not yet do, and which the .NET and
best JVM collectors probably do do, is to collect the youngest
generation per-cpu without all cpus having to collect at the same
time.
That is really awful. Moreover, I cannot imagine why they would not have
per-thread nurseries when they are much easier to implement than the
functionality they already built?!
It does have per-cpu nurseries. I don't know the exact details but I
think what makes the next step a bit tricky is handling references
between the young generations.
Post by Jon Harrop
The JVM and CLR certainly do not do that, no.
Post by Duncan Coutts
The plan is to implement this feature in GHC's collector and it
is expected to improve scalability for GC-heavy programs.
I expect that will make a huge difference. That is a really bad design for a
language like Haskell where those collections will occur very often.
It's not so much an oversight in the design as an issue of time taken
to complete the work.
Post by Jon Harrop
Post by Duncan Coutts
Btw, on the topic of scalability, we've got our first scalability
benchmarks from the Haskell/OpenSPARC project. On trivially
parallelisable benchmarks we're getting near linear scaling up to 64
threads (the total number of hardware threads available).
http://ghcsparc.blogspot.com/2009/02/bugfixes-and-pretty-graphs.html
Note that the graphs are log scale on both axes. While it looks
promising it's early days still of course. There's a lot more to do.
Is that a performance degradation from 32 to 64 cores on an embarrassingly
parallel problem? Why would that occur?
On the first graph, yes it is. We don't know why yet, but I expect
we'll find out.

Duncan
Duncan Coutts
2009-02-25 17:38:32 UTC
Permalink
Post by Duncan Coutts
Post by Jon Harrop
Post by Duncan Coutts
Btw, on the topic of scalability, we've got our first scalability
benchmarks from the Haskell/OpenSPARC project. On trivially
parallelisable benchmarks we're getting near linear scaling up to 64
threads (the total number of hardware threads available).
http://ghcsparc.blogspot.com/2009/02/bugfixes-and-pretty-graphs.html
Note that the graphs are log scale on both axes. While it looks
promising it's early days still of course. There's a lot more to do.
Is that a performance degradation from 32 to 64 cores on an embarrassingly
parallel problem? Why would that occur?
On the first graph, yes it is. We don't know why yet, but I expect
we'll find out.
One thing to note about the parfib benchmark is that while the problem
itself is embarrassingly parallel, the way this benchmark is set up is
to find the cost of fine-grained parallelism. So it doesn't fork just
64 threads and let each one run independently. It recursively calls
fib and uses par at each level, all the way down to some configurable
threshold (e.g. once the remaining recursion depth gets to 5 or so).
So it's actually sparking thousands of very lightweight evaluation
contexts. These "par sparks" get evaluated by the hardware threads
using a work-stealing queue. What it indicates is how small sparks can
be and still remain profitable compared to just doing them serially.
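To make that concrete, the benchmark has roughly this shape (a sketch
only, assuming GHC's Control.Parallel; the real parfib code and its
names may differ in detail):

import Control.Parallel (par, pseq)

-- Spark one branch of every recursive call until the depth budget is
-- used up, then fall back to the plain serial definition. Each par
-- creates a spark that an idle hardware thread may steal and evaluate.
parfib :: Int -> Int -> Int
parfib depth n
  | n < 2      = n
  | depth <= 0 = fib n
  | otherwise  = x `par` (y `pseq` (x + y))
  where
    x = parfib (depth - 1) (n - 1)
    y = parfib (depth - 1) (n - 2)

fib :: Int -> Int
fib n = if n < 2 then n else fib (n - 1) + fib (n - 2)

main :: IO ()
main = print (parfib 5 35)

Lowering the depth threshold makes the sparks smaller and more
numerous, which is exactly the cost being measured.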

Duncan
Jon Harrop
2009-02-25 18:28:20 UTC
Permalink
Post by Duncan Coutts
Post by Jon Harrop
Post by Duncan Coutts
As I understand it, and from what Jon said, the current .NET
and Java collectors are parallel but not concurrent.
No. They are mostly concurrent.
Post by Duncan Coutts
They do have short pause times.
No. They have long pause times for threads that exceed their allocation
quota, because those threads are suspended while the GC takes place;
other threads continue to run concurrently with the collection and see
only small pause times. By long pause times I mean anything up to about
4 seconds.
That's interesting and as you say it sounds like a good trade-off for
UI vs computational threads.
Yes. I was skeptical of the idea at first but it does seem to work well in
practice.
Post by Duncan Coutts
I expect this would be trickier to
manage for runtime systems that use lightweight threads, since all
lightweight threads on one OS thread would share the same young
generation.
I haven't seen any information that goes into that much detail. If possible,
the user might address the problem by phrasing their solution such that
work items spawn children with allocation rates similar to themselves. That
way the heavily allocating ones would most likely end up on the same OS
thread and only that thread would be suspended during GCs.

Perhaps the VM could even estimate what the allocation rate of a work item
will be before running it and try to run it on an appropriate thread
accordingly.
Post by Duncan Coutts
Post by Jon Harrop
Post by Duncan Coutts
A concurrent collector is usually slower overall because more care
needs to be taken so that the mutator and collector do not step on
each other's toes.
Fully concurrent collectors are generally believed to be slow but some of
the latest research claims that they have been implemented in JVMs with
competitive performance. Mostly concurrent collectors are certainly not
slow: both Sun's Hotspot JVM and Microsoft's CLR run circles around GHC.
I look forward to seeing the benchmarks :-)
Running this benchmark on 8 cores would be a start.
Post by Duncan Coutts
Post by Jon Harrop
Post by Duncan Coutts
What the current GHC collector does not yet do, and which the .NET and
best JVM collectors probably do do, is to collect the youngest
generation per-cpu without all cpus having to collect at the same
time.
That is really awful. Moreover, I cannot imagine why they would not have
per-thread nurseries when they are much easier to implement than the
functionality they already built?!
It does have per-cpu nurseries. I don't know the exact details but I
think what makes the next step a bit tricky is handling references
between the young generations.
Don't you just detect the creation of any such reference and flush
everything?
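To make that concrete, what I have in mind is an ordinary write barrier
feeding a remembered set. A toy sketch in Haskell (purely illustrative;
the Space/Obj/writeField names are mine, and a real barrier works on
raw heap words, not IORefs):

import Data.IORef

-- Each object lives in some thread's nursery or in the shared heap.
data Space = Nursery Int | Shared deriving (Eq, Show)

data Obj = Obj { objSpace :: Space, objFields :: IORef [Obj] }

-- (source, target) pairs that cross a nursery boundary; a minor GC of
-- one nursery can treat the recorded targets as extra roots without
-- synchronizing with the other nurseries.
type RememberedSet = IORef [(Obj, Obj)]

writeField :: RememberedSet -> Obj -> Obj -> IO ()
writeField remSet src dst = do
  modifyIORef (objFields src) (dst :)
  -- The barrier proper: record only writes that cross a boundary.
  case (objSpace src, objSpace dst) of
    (Nursery a, Nursery b) | a /= b -> modifyIORef remSet ((src, dst) :)
    (Shared, Nursery _)             -> modifyIORef remSet ((src, dst) :)
    _                               -> return ()
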
Post by Duncan Coutts
Post by Jon Harrop
The JVM and CLR certainly do not do that, no.
Post by Duncan Coutts
The plan is to implement this feature in GHC's collector and it
is expected to improve scalability for GC-heavy programs.
I expect that will make a huge difference. That is a really bad design
for a language like Haskell where those collections will occur very
often.
It's not so much an oversight in the design as an issue of time taken
to complete the work.
Right. I saw that Simon Marlow had implemented a fully concurrent GC but
canned it for performance reasons.

I'll be interested to see how development of my GC for my HLVM project
goes... :-)
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Jon Harrop
2009-02-25 19:41:47 UTC
Permalink
Post by Jon Harrop
Post by Duncan Coutts
Post by Jon Harrop
Fully concurrent collectors are generally believed to be slow but some
of the latest research claims that they have been implemented in JVMs
with competitive performance. Mostly concurrent collectors are certainly
not slow: both Sun's Hotspot JVM and Microsoft's CLR run circles around
GHC.
I look forward to seeing the benchmarks :-)
Running this benchmark on 8 cores would be a start.
Incidentally, I get 436ms for my F# on Mono, which is faster than any
result for GHC. I'd expect .NET to be significantly faster still.
Moreover, I'd use the TPL on .NET.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Duncan Coutts
2009-02-26 14:13:23 UTC
Permalink
Post by Jon Harrop
Post by Duncan Coutts
Post by Jon Harrop
Post by Duncan Coutts
What the current GHC collector does not yet do, and which the .NET and
best JVM collectors probably do do, is to collect the youngest
generation per-cpu without all cpus having to collect at the same
time.
That is really awful. Moreover, I cannot imagine why they would not have
per-thread nurseries when they are much easier to implement than the
functionality they already built?!
It does have per-cpu nurseries. I don't know the exact details but I
think what makes the next step a bit tricky is handling references
between the young generations.
Don't you just detect the creation of any such reference and flush
everything?
Honestly I don't know enough of the details to comment properly but my
guess is that it's related to overwriting thunks with values, which
happens rather frequently.

Duncan
Jon Harrop
2009-02-26 20:43:35 UTC
Permalink
Post by Duncan Coutts
Post by Jon Harrop
Don't you just detect the creation of any such reference and flush
everything?
Honestly I don't know enough of the details to comment properly but my
guess is that it's related to overwriting thunks with values, which
happens rather frequently.
I see. Strictness analysis to the rescue. :-)
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
George Neuner
2009-02-26 19:24:28 UTC
Permalink
Post by Jon Harrop
Post by Duncan Coutts
Post by Jon Harrop
Post by Duncan Coutts
As I understand it, and from what Jon said, the current .NET
and Java collectors are parallel but not concurrent.
No. They are mostly concurrent.
Post by Duncan Coutts
They do have short pause times.
No. They have long pause times for threads that exceed their allocation
quota, because those threads are suspended while the GC takes place;
other threads continue to run concurrently with the collection and see
only small pause times. By long pause times I mean anything up to about
4 seconds.
That's interesting and as you say it sounds like a good trade-off for
UI vs computational threads.
Yes. I was skeptical of the idea at first but it does seem to work well in
practice.
Post by Duncan Coutts
I expect this would be trickier to
manage for runtime systems that use lightweight threads, since all
lightweight threads on one OS thread would share the same young
generation.
I haven't seen any information that goes into that much detail. If possible,
the user might address the problem by phrasing their solution such that
work items spawn children with allocation rates similar to themselves. That
way the heavily allocating ones would most likely end up on the same OS
thread and only that thread would be suspended during GCs.
I don't think that's possible with the current CLR or JVM - they seem
to do whatever they please. There are lightweight thread packages
(e.g., on Solaris) that allow assignment of the lightweight thread to
a particular OS thread.

You can do this natively on Windows using fibers. CLR uses fibers
internally, but doesn't give you the same control over them that the
OS API does. I don't know offhand what Mono does on Linux.
Post by Jon Harrop
Perhaps the VM could even estimate what the allocation rate of a work item
will be before running it and try to run it on an appropriate thread
accordingly.
That's an interesting idea.
Post by Jon Harrop
Post by Duncan Coutts
Post by Jon Harrop
Post by Duncan Coutts
A concurrent collector is usually slower overall because more care
needs to be taken so that the mutator and collector do not step on
each other's toes.
Fully concurrent collectors are generally believed to be slow but some of
the latest research claims that they have been implemented in JVMs with
competitive performance. Mostly concurrent collectors are certainly not
slow: both Sun's Hotspot JVM and Microsoft's CLR run circles around GHC.
I look forward to seeing the benchmarks :-)
Concurrent collectors aren't slow [performance wise], but they tend to
lag farther behind allocation in the shared heap than do phased
stop-and-? "mostly concurrent" collectors.

Generally, fully concurrent collectors are *always* working - as soon
as one collection is done another is started. With the right data
structures you can mark for collection N while simultaneously sweeping
for collection N-1. This, of course, requires some coordination with
mutator threads so as not to lose new allocations. But it also has
the effect that shared structures are always allocated "black" in the
3-color abstraction, which means that a shared allocation that quickly
becomes junk will survive 2 or 3 collections depending on when it was
allocated. This increases the required memory overhead to avoid
starvation relative to the phased stop-and-? collectors.

Most systems are hybrids anyway: every so often it is useful to
compact the shared heaps - which requires stopping - but you don't
want to do it every time because it costs too much.
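To illustrate the 3-color abstraction, here is a toy marking pass in
Haskell (illustrative only, with hypothetical names; a real collector
traces raw heap words and interleaves this work with the mutator):

import qualified Data.Map as M

data Colour = White | Grey | Black deriving (Eq, Show)

-- A toy heap: each object id maps to the ids it references.
type Heap = M.Map Int [Int]

-- One full mark: roots start Grey; anything reachable ends up Black,
-- anything still White afterwards is garbage.
mark :: Heap -> [Int] -> M.Map Int Colour
mark heap roots = go roots (foldr toGrey whites roots)
  where
    whites = M.map (const White) heap
    toGrey k m = M.insert k Grey m
    go [] cols = cols
    go (o:os) cols =
      let kids  = M.findWithDefault [] o heap
          fresh = [k | k <- kids, M.findWithDefault White k cols == White]
      in go (fresh ++ os) (M.insert o Black (foldr toGrey cols fresh))

-- e.g. mark (M.fromList [(1,[2]),(2,[]),(3,[])]) [1]
-- leaves 1 and 2 Black and 3 White.

A concurrent collector additionally allocates new objects Black during
the cycle, which is why, as noted above, a short-lived shared
allocation can survive a collection or two.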

George
Jon Harrop
2009-02-24 22:14:55 UTC
Permalink
Post by parnell
Post by Jon Harrop
1. You have concluded that Haskell performs "very well" despite the fact
that I have proven that it not only performs badly on one core but it
does not even scale to eight cores. Moreover, you need to actually
compare it with fast code, e.g. written in C. In reality, bog standard C
code compiled with GCC (not a good compiler) and without using advanced
parallelism (e.g. Cilk) is 2x faster on one core and over 4x faster on
eight cores than Florian's fastest Haskell and over 1,000x faster than
his idiomatic parallel Haskell.
Wait, I thought we were comparing OCaml, F#, Clojure, and
Pascal? ;)
Ok, I was referring to all languages. I think the numerical performance of
these functional languages leaves a lot to be desired and Haskell being 50%
faster than OCaml in the context of parallelism is not something to sing
about.
Post by parnell
2.544u 0.004s 0:02.57 98.8% 0+0k 0+0io 0pf+0w
$ time ./mandelbrot +RTS -N1
real 0m2.022s
user 0m2.016s
sys 0m0.000s
That is very surprising and contrary to my own results using just g++ -O2 on
x86 Opteron 2352:

C++: 1.15s
Haskell: 2.02s

What exactly is your setup?
Post by parnell
The Haskell version continues to improve performance, with the
10-thread version getting the best time. How is this "does not even
scale to eight cores"?
$ time ./mandelbrot +RTS -N2
real 0m1.749s
user 0m3.024s
sys 0m0.000s
$ time ./mandelbrot +RTS -N3
real 0m1.031s
user 0m2.124s
sys 0m0.016s
$ time ./mandelbrot +RTS -N4
real 0m0.942s
user 0m2.844s
sys 0m0.012s
$ time ./mandelbrot +RTS -N5
real 0m0.723s
user 0m2.384s
sys 0m0.052s
$ time ./mandelbrot +RTS -N6
real 0m0.593s
user 0m2.184s
sys 0m0.088s
$ time ./mandelbrot +RTS -N7
real 0m0.551s
user 0m2.348s
sys 0m0.008s
$ time ./mandelbrot +RTS -N8
real 0m0.713s
user 0m2.864s
sys 0m0.072s
$ time ./mandelbrot +RTS -N9
real 0m0.673s
user 0m2.740s
sys 0m0.128s
$ time ./mandelbrot +RTS -N10
real 0m0.496s
user 0m2.320s
sys 0m0.064s
$ time ./mandelbrot +RTS -N11
real 0m0.503s
user 0m2.260s
sys 0m0.076s
$ time ./mandelbrot +RTS -N12
real 0m0.607s
user 0m2.176s
sys 0m0.052s
With -N6 you're already within 20% of your best time but still have a
quarter of your compute power (2 of 8 cores) unused. Also, the Haskell
was 2x slower than Cilk on 1 core and 4x slower on 8 cores for me. So I
think that is quite sublinear scaling, even for a test that is not
GC-intensive.
Post by parnell
Post by Jon Harrop
The C code scales a lot better because it does not
introduce unnecessary global synchronization as GHC's GC does and even
the C code leaves a lot of room for optimization (e.g. wait-free
work-stealing queues perform load balancing much more efficiently).
You are using version 6.8.3 of the GHC compiler which does not have
the parallel garbage collector that is included with the 6.10
version.
Actually I'm using 6.8.2 but that does not affect my statement: the
parallel GC still stops the world for the entire GC cycle, which is a
significant proportion of the total run time of most programs.
Post by parnell
Both concurrent and parallel garbage collection schemes stop
the world for some period of time, so your argument applies to all
functional languages as well as Java and .NET.
Sure but they stop for something like synchronizing the mark phase and not
to perform the entire GC cycle as GHC currently does.
Post by parnell
From my own experience with other programs running in a multi-core
setting I have found that GHC's parallel GC has minimized the time
spent garbage collecting. That said there are times that a concurrent
GC would perform better.
Have you studied pause times?
Post by parnell
Post by Jon Harrop
2. You have neglected all of the low-level optimizations that Florian did
in his ugly hack: manual inlining of the original "loop" function,
manually replacing the outer "if i=0 then '*' else ' '" with a different
"loop" function that is hardcoded to return a char instead of the number
of iterations, manual unboxing of complex numbers into separate floating
point
Beauty is in the eye of the beholder; I find Florian's code more
"idiomatic" and readable than the original (no offense to Michael).
Perhaps but high-level code should not contain any of the optimizations I
listed. The code should be more like:

let rec loop c z i =
  if i > max_iterations then '*' else
  if abs z > 2.0 then ' ' else
  loop c (z*z + c) (i + 1)

let iterate j i =
  loop (float (i - n/2) / 40. - 0.5 + float (j - n/2) / 40. * I) 0.0 1

Array.init n (fun j -> String.init n (iterate j))
|> Array.iter print_endline

The complex arithmetic should be inlined and no boxing or even consecutive
allocation should be imposed. Array and string initialization should be
parallelized in the stdlib using divide and conquer. The higher-order
initialization functions should be inlined to avoid making a function call
in the inner loop. Performance should be as good as C but with much easier
parallelism and easy access to kickass data structures.
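For instance, a divide-and-conquer Array.init might look roughly like
this sketch (written in Haskell, with par/pseq standing in for whatever
primitive the stdlib would use; parInit is a hypothetical name, and
force from the deepseq package is assumed because par alone only
evaluates a list to its first constructor):

import Control.DeepSeq (NFData, force)
import Control.Parallel (par, pseq)

-- Split the index range, spark one half, evaluate the other.
parInit :: NFData a => Int -> (Int -> a) -> [a]
parInit n f = go 0 n
  where
    go lo hi
      | hi - lo <= 16 = force (map f [lo .. hi - 1])  -- serial base case
      | otherwise     = xs `par` (ys `pseq` force (xs ++ ys))
      where
        mid = (lo + hi) `div` 2
        xs  = go lo mid
        ys  = go mid hi
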
Post by parnell
When are you going to finish your LLVM so we can stop arguing about
OCaml, Haskell, etc ... ;)
I should have something to ship in the next month or two. Current features
are:

. Unit, bool, int, float types.
. Arrays.
. Algebraic datatypes.
. First-class function pointers.
. Run-time types and generic printing.
. JIT compilation.
. Interactivity.
. Easy C FFI.

I need closures, a GC and a front end and then I'll ship 1.0. Performance is
already several times better than OCaml for most numerical benchmarks and I
haven't even enabled LLVM's optimization passes yet (!).

Provided you stick to core functionality as much as possible, LLVM is
absolutely bloody brilliant. For example, it automates all of the low-level
optimizations I described above to compile struct code (e.g. tuples) into
machine code that runs several times faster than .NET. The only hard part
is finding out which bits of LLVM are unreliable and avoiding them.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-24 15:41:33 UTC
Permalink
Post by Jon Harrop
Post by parnell
Using 7 worker threads, this takes 0.551 seconds.
So, it is 1.6x slower than your best OCaml version (I might add that
no one else was able to reproduce your results running your OCaml
code).
Indeed, Florian only ran his parallel version on 2 cores before concluding
that GHC's support for scalable parallelism was superb. He was wrong.
Just for the record: I did not claim anything like that. I concluded from
my observations that Haskell "can outperform OCaml when state-of-the-art
libraries are used".
Post by Jon Harrop
Post by parnell
Florian's version was written in a straight forward functional style and I
would argue is an argument that Haskell's "advanced optimizations and
libraries" perform very well and allow one to program in an advanced (high
level) functional style without giving up performance.
Let's see. :)
Post by Jon Harrop
1. You have concluded that Haskell performs "very well" despite the fact
that I have proven that it not only performs badly on one core but it does
not even scale to eight cores. Moreover, you need to actually compare it
with fast code, e.g. written in C. In reality, bog standard C code compiled
with GCC (not a good compiler) and without using advanced parallelism (e.g.
Cilk) is 2x faster on one core and over 4x faster on eight cores than
Florian's fastest Haskell
Standard C does not even know about parallelism. So, this would involve
system-specific stuff. I heard some people criticising the need to use
third-party libraries. What would they say about this suggestion?
Post by Jon Harrop
and over 1,000x faster than his idiomatic
parallel Haskell.
I still doubt that.
Post by Jon Harrop
The C code scales a lot better because it does not
introduce unnecessary global synchronization as GHC's GC does and even the
C code leaves a lot of room for optimization (e.g. wait-free work-stealing
queues perform load balancing much more efficiently).
Not here. The Haskell implementations that use list processing spent less
than 4% of their time in garbage collection. The version based on the
uvector library performs no intermediate collections at all.
Post by Jon Harrop
2. You have neglected all of the low-level optimizations that Florian did in
his ugly hack: manual inlining of the original "loop" function,
As in Michael's version: a local tail-recursive function, compared to a
while loop in the OCaml version...
Post by Jon Harrop
manually
replacing the outer "if i=0 then '*' else ' '" with a different
"loop" function that is hardcoded to return a char instead of the number of
iterations,
This had no measurable influence on performance. I wouldn't be surprised
if GHC did this transformation by itself. (Besides that, my intention
was not to cheat, but to make the program smaller and more pleasant to the
eye. If that is an ugly hack, then I'm fine with it.)
Post by Jon Harrop
manual unboxing of complex numbers into separate floating point
values, manual common subexpression elimination of zr2, zi2 and temp,
Exactly like in the OCaml version...
Post by Jon Harrop
use
of Don Stewart's experimental uvector library to evade the crippling
performance of idiomatic lazy code with immutable data structures.
The reverse is true. The outstanding performance of the uvector library
depends on the immutability of the underlying data structure. Otherwise,
optimisations like list fusion would be infeasible.
Jon Harrop
2009-02-24 22:18:29 UTC
Permalink
Post by Florian Kreidler
Post by Jon Harrop
1. You have concluded that Haskell performs "very well" despite the fact
that I have proven that it not only performs badly on one core but it
does not even scale to eight cores. Moreover, you need to actually
compare it with fast code, e.g. written in C. In reality, bog standard C
code compiled with GCC (not a good compiler) and without using advanced
parallelism (e.g. Cilk) is 2x faster on one core and over 4x faster on
eight cores than Florian's fastest Haskell
Standard C does not even know about parallelism. So, this would involve
system-specific stuff. I heard some people criticising the need to use
third-party libraries. What would they say about this suggestion?
They can choose from two options:

. Common C extensions that facilitate parallelism and allow the C to run 4x
faster than Haskell on 8 cores.

. Vanilla C on one core that runs hundreds of times faster than vanilla
Haskell without libraries on any number of cores.
Post by Florian Kreidler
Post by Jon Harrop
The C code scales a lot better because it does not
introduce unnecessary global synchronization as GHC's GC does and even
the C code leaves a lot of room for optimization (e.g. wait-free
work-stealing queues perform load balancing much more efficiently).
Not here. The Haskell implementations that use list processing spent less
than 4% of their time in garbage collection...
On 8 cores, the other 96% ideally becomes 8x faster (12%), so GC would
be expected to take 4/(4+12) = 25% of the run time.
Post by Florian Kreidler
Post by Jon Harrop
2. You have neglected all of the low-level optimizations that Florian did
in his ugly hack: manual inlining of the original "loop" function,
As in Michael's version: a local tail-recursive function, compared to a
while loop in the OCaml version...
The while loops are faster.
Post by Florian Kreidler
Post by Jon Harrop
manual unboxing of complex numbers into separate floating point
values, manual common subexpression elimination of zr2, zi2 and temp,
Exactly like in the OCaml version...
Which is also several times slower than C here.
Post by Florian Kreidler
Post by Jon Harrop
use
of Don Stewart's experimental uvector library to evade the crippling
performance of idiomatic lazy code with immutable data structures.
The reverse is true. The outstanding performance of the uvector library
depends on the immutability of the underlying data structure. Otherwise,
optimisations like list fusion would be infeasible.
You call 4x slower than C "outstanding performance"?!
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Florian Kreidler
2009-02-24 23:20:58 UTC
Permalink
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
The C code scales a lot better because it does not
introduce unnecessary global synchronization as GHC's GC does and even
the C code leaves a lot of room for optimization (e.g. wait-free
work-stealing queues perform load balancing much more efficiently).
Not here. The Haskell implementations that use list processing spent less
than 4% of their time in garbage collection...
On 8 cores, the other 96% ideally becomes 8x faster (12%), so GC would
be expected to take 4/(4+12) = 25% of the run time.
Duncan Coutts has just explained to you why this does not necessarily hold.
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
2. You have neglected all of the low-level optimizations that Florian did
in his ugly hack: manual inlining of the original "loop" function,
As in Michael's version: a local tail-recursive function, compared to a
while loop in the OCaml version...
The while loops are faster.
Don't be ridiculous.
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
manual unboxing of complex numbers into separate floating point
values, manual common subexpression elimination of zr2, zi2 and temp,
Exactly like in the OCaml version...
Which is also several times slower than C here.
I could just as well have written: Exactly like in the C version...
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
use
of Don Stewart's experimental uvector library to evade the crippling
performance of idiomatic lazy code with immutable data structures.
The reverse is true. The outstanding performance of the uvector library
depends on the immutability of the underlying data structure. Otherwise,
optimisations like list fusion would be infeasible.
You call 4x slower than C "outstanding performance"?!
No. Did I?
Jon Harrop
2009-02-25 14:52:35 UTC
Permalink
Post by Florian Kreidler
Post by Jon Harrop
The while loops are faster.
Don't be ridiculous.
The fastest implementation is a while loop written in C.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
Jon Harrop
2009-02-26 21:05:50 UTC
Permalink
Post by Jon Harrop
Post by Florian Kreidler
Post by Jon Harrop
The while loops are faster.
Don't be ridiculous.
The fastest implementation is a while loop written in C.
Now here's an interesting thing. C99 offers easy complex arithmetic with no
excuse for poor performance. It permits the following elegant loop:

#include <complex.h>

enum { max_i = 1000 };  /* max_iterations, as in the other versions */

double sqr(double x) { return x*x; }

double cnorm2(double complex z) { return sqr(creal(z)) + sqr(cimag(z)); }

int loop(double complex c) {
    double complex z = c;
    int i = 1;
    while (cnorm2(z) <= 4.0 && i++ < max_i)
        z = z*z + c;
    return i;
}

Compared to your nasty Haskell:

pixel :: Int -> Int -> Char
pixel x y = loop 0.0 0.0 1
  where
    ci, cr :: Float
    ci = fromIntegral y / 40.0
    cr = fromIntegral x / 40.0 - 0.5
    loop zi zr i
      | i > max_iterations = '*'
      | zi2 + zr2 > 4.0    = ' '
      | otherwise          = loop zi' zr' (i + 1)
      where
        temp = zr * zi
        zr2  = zr * zr
        zi2  = zi * zi
        zi'  = temp + temp + ci
        zr'  = zr2 - zi2 + cr

GCC sucks donkey brains through a straw, taking a pitiful 5.7s to run
this (even slower than Haskell!). The good news, though, is that
llvm-gcc (which is based upon the kickass LLVM compilation
infrastructure extravaganza) runs circles around almost all other code
in any language, generating code that runs in only 1.3s. That's within
20% of the best possible performance I've been able to achieve with
hand tweaking.
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
unknown
2009-02-19 17:45:27 UTC
Permalink
Post by unknown
Pentium 4, 3.2 GHz
Windows XP
2.475 sec. -- F#
3.276 sec. -- Clojure (java -server)
5.921 sec. -- OCaml (ocamlopt)
5.156 sec. -- FreePascal
I modified the Clojure program to use pmap, but I had little
hope that the result would be faster.

1.781 sec. -- Clojure with pmap (java -server)


I think the Pentium 4 that's running this must be single core.
So how can pmap speed things up?


(def runs 100)
(def max_iterations 1000)

(defn iter [ci cr]
  (let [max_iter (int max_iterations)
        ci (double ci)
        cr (double cr)]
    (loop [zi (double ci) zr (double cr) i (int 1)]
      (if (<= i max_iter)
        (let [zr2 (* zr zr) zi2 (* zi zi)]
          (if (<= (+ zr2 zi2) (double 4.0))
            (recur (+ (* (* zr zi) (double 2.0)) ci)
                   (+ (- zr2 zi2) cr)
                   (inc i))
            i))
        0))))

(defn mand [n]
  (doseq [y (range -39 39)]
    (when (= 1 n) (print "\n"))
    (let [counts (pmap
                   (fn [x] (iter (/ x 40.0) (- (/ y 40.0) 0.5)))
                   (range -39 39))]
      (when (= 1 n)
        (doseq [i counts]
          (print (if (= 0 i) "*" " ")))))))
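;; Caveat: pmap is lazy, and counts is only forced inside (when (= 1 n) ...),
;; so for runs 2..100 the iter calls may never execute at all; that, rather
;; than parallelism, could explain the speedup on a single-core machine.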


(time
  (doseq [i (range 1 (inc runs))]
    (mand i)))