共用方式為


F# Performance Tweaking

Jomo Fisher—I’ve enjoyed reading the blog series on F# and game development here. Joh had posted an example of F# of using RK4 and showed some performance numbers comparing F# to C++ and OCaml. Don Syme had been reading too and asked if anyone on the F# team wanted to take a stab to see if they could optimize Joh’s code any further. I thought it would be fun. It turns out there were a few small changes that make a significant performance improvement without sacrificing immutability or other nice functionalness.

The example code in this article was compiled with the 1.9.6.2 version of the F# compiler.

Joh probably already accounted for this, but it’s worth pointing out that there is a huge performance difference between Debug and Release builds for this example. On my machine I could measure a 3x performance difference between the two.

Considering just Release builds, I was able to get another 2-3x improvement by making several simple changes:

- Change a few functions to return struct instead of tuple. For this, I created a ps (e.g. Position-Speed) struct and a sa (e.g. Speed-Acceleration) struct.

- I manually lifted a few computations out of the outer similar loop. I made (euler_implicit_opt accel_opt) and (runge_kutta_4_opt (adapt_opt accel_opt))into values.

- I changed the simulate function to use OptimizedClosures.FastFunc3.

This last part especially needs some explaining. Closure optimization can be used when a function will be called many times. It allows F# to do a bit of expensive work up front that will later be amortized across the many calls to the function. Below is the simulate function using optimized closures. I’ve bolded the affected lines.

let inline simulate_opt intg_func t0 t_fin delta pos0 speed0 =

    let oc = OptimizedClosures.FastFunc3<_,_,_,_>.Adapt(intg_func)

    let rec repeat t0 pos0 speed0 =

        if t0 > t_fin then (pos0, speed0)

        else

            let t1 = t0 + delta

            let ps:ps<_> = oc.Invoke(pos0,speed0,delta)

            repeat t1 ps.pos ps.speed

    repeat t0 pos0 speed0

Unfortunately, this is not something that you can stick everywhere and just get a benefit (otherwise, the compiler could just inject it for you). You should definitely profile before and after. Here are my final performance numbers:

Optimized---

Euler single: 2.529253

Euler multi: 1.328133

RK4 single: 2.383238

RK4 multi: 1.241124

Baseline---

Euler single: 7.672767

Euler multi: 3.922392

RK4 single: 4.563456

RK4 multi: 2.450245

Here’s the entire code. The optimized functions end in _opt. Thanks Joh for an awesome diversion!

#light

open System

let run_time_func (func: unit -> 'a) =

    let time0 = System.DateTime.Now

    let res = func ()

    let diff0 = System.DateTime.Now - time0

    (diff0, res)

let map_parallel func items =

    let tasks =

        seq {

             for i in items -> async {

                    return (func i)

                    }

            }

    Async.Run (Async.Parallel tasks)

let gravity =

     -9.81

let inline spring pos =

    let k = 100.0 in

    -k * pos

let inline drag k pos speed = -k * speed

type ps<'num> = struct

    val pos:'num

    val speed:'num

    new(pos,speed) = {pos=pos;speed=speed}

end

type sa<'num> = struct

    val speed:'num

    val accel:'num

    new(speed,accel) = {speed=speed;accel=accel}

end

let inline euler_implicit accel_func pos speed delta =

    let speed2 = speed + delta * accel_func pos speed in

        pos + delta * speed2, speed2

let inline euler_implicit_opt accel_func pos speed delta =

    let speed2 = speed + delta * accel_func pos speed in

        ps(pos + delta * speed2, speed2)

let inline adapt (f: 'num -> 'num -> 'num) =

    fun pos speed ->

        let accel = f pos speed in

            speed, accel

let inline adapt_opt (f: 'num -> 'num -> 'num) =

    fun pos speed ->

        let accel = f pos speed in

            sa(speed, accel)

           

let inline runge_kutta_4 deriv_func pos speed delta =

    let half_delta = 0.5 * delta

    let s1, a1 = deriv_func pos speed

    let s2, a2 = deriv_func (pos + half_delta * s1) (speed + half_delta * a1)

    let s3, a3 = deriv_func (pos + half_delta * s2) (speed + half_delta * a2)

    let s4, a4 = deriv_func (pos + delta * s3) (speed + delta * a3)

    let pos1 = pos + delta/6.0*(s1 + 2.0*s2 + 2.0*s3 + s4)

    let speed1 = speed + delta/6.0*(a1 + 2.0*a2 + 2.0*a3 + a4) in

        pos1, speed1

let inline runge_kutta_4_opt deriv_func pos speed delta =

    let half_delta = 0.5 * delta

    let sa1:sa<_> = deriv_func pos speed

    let sa2 = deriv_func (pos + half_delta * sa1.speed) (speed + half_delta * sa1.accel)

    let sa3 = deriv_func (pos + half_delta * sa2.speed) (speed + half_delta * sa2.accel)

    let sa4 = deriv_func (pos + delta * sa3.speed) (speed + delta * sa3.speed)

    let pos1 = pos + delta/6.0*(sa1.speed + 2.0*sa2.speed + 2.0*sa3.speed + sa4.speed)

    let speed1 = speed + delta/6.0*(sa1.accel + 2.0*sa2.accel + 2.0*sa3.accel + sa4.accel) in

        ps(pos1, speed1)

let inline simulate intg_func t0 t_fin delta pos0 speed0 =

    let rec repeat t0 pos0 speed0 =

        if t0 > t_fin then (pos0, speed0)

        else

            let t1 = t0 + delta

            let pos1, speed1 = intg_func pos0 speed0 delta

            repeat t1 pos1 speed1

    repeat t0 pos0 speed0

   

let inline simulate_opt intg_func t0 t_fin delta pos0 speed0 =

    let oc = OptimizedClosures.FastFunc3<_,_,_,_>.Adapt(intg_func)

    let rec repeat t0 pos0 speed0 =

        if t0 > t_fin then (pos0, speed0)

        else

            let t1 = t0 + delta

            let ps:ps<_> = oc.Invoke(pos0,speed0,delta)

            repeat t1 ps.pos ps.speed

    repeat t0 pos0 speed0

   

let initial_states = [ for s in 1..1000 -> (float s * 1.0, 0.0) ]

let t0 = 0.0

let t_fin = 1000.0

let delta = 0.025

let accel pos speed = (drag 1.0) pos speed + spring pos + gravity in

let accel_opt pos speed = (drag 1.0) pos speed + spring pos + gravity in

let ef = (euler_implicit_opt accel_opt)

let rk4f = (runge_kutta_4_opt (adapt_opt accel_opt))

// Changes

// - Change euler_implicit_opt and runge_kutta_4_opt to return a struct (ps)

// - Manually lift (euler_implicit_opt accel) and (runge_kutta_4_opt (adapt accel)) out of loop

// - Change adapt_opt to return struct (sa)

// - Use optimized closure in simulate_opt

printfn "Optimized---"

for i in 0..0 do

    let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate_opt ef t0 t_fin (0.25*delta) x y))

    printfn "Euler single: %f" run_time.TotalSeconds;

    let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate_opt ef t0 t_fin (0.25*delta) x y))

    printfn "Euler multi: %f" run_time.TotalSeconds;

    let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate_opt rk4f t0 t_fin delta x y))

    printfn "RK4 single: %f" run_time.TotalSeconds;

    let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate_opt rk4f t0 t_fin delta x y))

    printfn "RK4 multi: %f" run_time.TotalSeconds;

   

printfn "Baseline---"

for i in 0..0 do

    let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))

    printfn "Euler single: %f" run_time.TotalSeconds;

    let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))

    printfn "Euler multi: %f" run_time.TotalSeconds;

    let (run_time, res) = run_time_func (fun () -> initial_states |> List.map(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))

    printfn "RK4 single: %f" run_time.TotalSeconds;

    let (run_time, res) = run_time_func (fun () -> initial_states |> map_parallel(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))

    printfn "RK4 multi: %f" run_time.TotalSeconds;

done;

        

              

 

T his posting is provided "AS IS" with no warranties, and confers no rights.

Comments

  • Anonymous
    September 16, 2008
    What?!  You have a physical constant and no units of measure?! :)

  • Anonymous
    September 16, 2008
    Units of measure would fit nicely in this code. But, I don't know which version of the compiler Joh is using and they weren't available until 1.9.6.0. They won't affect performance anyway.

  • Anonymous
    September 16, 2008
    I managed to get another 20% off the RK4 time by singling out the delta/6.0 calculation in runge_kutta_4_opt.

  • Anonymous
    September 17, 2008
    I'm pleased to see my experiments are attracting attention :) I had somehow missed the availability of structs which made me wonder how fast 3d computation could be achieved in F#. Good catch, brilsmurf. I would not have thought it would make such an impact.

  • Anonymous
    September 17, 2008
    This post is entirely devoted to shameless plugs and ‘Me Too’ blogging. &#160; The Stack Overflow public