[clean-list] Why drastically different performance of these two implementations?

Wed Dec 19 20:00:39 MET 2007

I would like to understand why 'iconvolve' below performs so much worse 
than 'convolve'.  How might one speed up iconvolve, while still 
preserving its pattern of traversal of arrays a and b?

David

-----

module myconvolve

import StdEnv

// When compiled with clm -fusion, this implementation proves to be
// marginally faster than gcc -O2 compiled C using nested for loops
// as in:
//   for(i=0; i++, i<na){
//     for(j=0; j++, j<nb){
//       ab[i+j] += a[i] * b[j];
//     }
//   }
convolve :: (ar e) (ar e) -> (ar e) | Array ar e & Arith e
convolve a b = {let start = max 0 (i-m)
                    end   = min i n
                in
                        foldl (+) zero [ a.[i-k] * b.[k] \\ k <- 
[start..end] ]
                \\ i <- [0..(m+n)]}
where
    m = size a - 1
    n = size b - 1

// Why does this implementation (imitating the C algorithm above) run 
30x slower?
iconvolve :: !{#Real} !{#Real} -> {#Real}
iconvolve a b = iconvolve` ab 0 0 0
where
    na = size a
    nb = size b
    nab = na + nb - 1
    ab = createArray nab 0.0
    iconvolve` ab=:{[k]=ab_k} i j k  // constraint k==i+j is maintained
    | i >= na   = ab
    | j >= nb   = iconvolve` ab (i+1) 0 (i+1)
    | otherwise = iconvolve` ab` i (j+1) (i+j+1)
                  where
                      ab` = {ab & [k] = ab_k + a.[i] * b.[j]}

Start :: Real
//Start = (iconvolve a b).[m+n-2]
Start = (convolve a b).[m+n-2]
where
    m = 10000
    n = 6000
    a :: {#Real}
    a = {fromInt x \\ x <- [1..m]}
    b :: {#Real}
    b = {fromInt x \\ x <- [1..n]}