Julia By Example | Source Examples

Julia is a high-level, high-performance dynamic programming language for technical computing. This site is a non official series of examples of Julia, for more details see about.

quine.jl

This script prints a string identical to it's own source code, see here.

github.com/karbarcca/Rosetta-Julia/blob/master/src/Quine.jl: In Julia, $x in a string literal interpolates the value of the variable into the string. $(expression) evaluates the expression and interpolates the result into the string. Normally, the string value "hi\tworld" would be inserted without quotation marks and with a literal tab

The repr function returns a string value that contains quotation marks and in which the literal tab is replaced by the characters \t. When the result of the repr function is interpolated, the result is what you would type into your code to create that string literal.

x="println(\"x=\$(repr(x))\\n\$x\")"
println("x=$(repr(x))\n$x")

bubblesort.jl

import Base.Sort
immutable BubbleSortAlg <: Sort.Algorithm end
const BubbleSort = BubbleSortAlg()

function Base.sort!(v::AbstractVector, lo::Int, hi::Int, ::BubbleSortAlg, o::Sort.Ordering)
    while true
        clean = true
        for i = lo:hi-1
            if Sort.lt(o, v[i+1], v[i])
                v[i+1], v[i] = v[i], v[i+1]
                clean = false
            end
        end
        clean && break
    end
    return v
end

enum.jl

Some description here.

macro enum(T,syms...)
    blk = quote
        immutable $(esc(T))
            n::Int32
            $(esc(T))(n::Integer) = new(n)
        end
        Base.show(io::IO, x::$(esc(T))) = print(io, $syms[x.n+1])
        Base.show(io::IO, x::Type{$(esc(T))}) = print(io, $(string("enum ", T, ' ', '(', join(syms, ", "), ')')))
    end
    for (i,sym) in enumerate(syms)
        push!(blk.args, :(const $(esc(sym)) = $(esc(T))($(i-1))))
    end
    push!(blk.args, :nothing)
    blk.head = :toplevel
    return blk
end

modint.jl

module ModInts

importall Base

immutable ModInt{n} <: Integer
    k::Int
    ModInt(k) = new(mod(k,n))
end

-{n}(a::ModInt{n}) = ModInt{n}(-a.k)
+{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k+b.k)
-{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k-b.k)
*{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k*b.k)

convert{n}(::Type{ModInt{n}}, i::Int) = ModInt{n}(i)
promote_rule{n}(::Type{ModInt{n}}, ::Type{Int}) = ModInt{n}

show{n}(io::IO, k::ModInt{n}) = print(io, "$(k.k) mod $n")
showcompact(io::IO, k::ModInt) = print(io, k.k)

end # module

queens.jl

Example of the 8 Queens Puzzle.

addqueen(queens::Array{Vector{Int}}, queen::Vector{Int}) = push!(copy(queens), queen)

hitsany(queen::Vector{Int}, queens::Array{Vector{Int}}) = any(map(x->hits(queen, x), queens))
hits(a::Array{Int}, b::Array{Int}) = any(a .== b) || abs(a-b)[1] == abs(a-b)[2]

function solve(x, y, n, d=Array(Vector{Int}, 0))
  if n == 0
    return d
  end
  for px = 1:x
    for py = 1:y
      if !hitsany([px, py], d)
        s = solve(x, y, n-1, addqueen(d, [px, py]))
        s != nothing && return s
      end
    end
  end
  return nothing
end

preduce.jl

importall Base

# figure 5.2 from principles of parallel programming, ported to julia.
# sum a vector using a tree on top of local reductions.
function sum(v::DArray)
    P = procs(v)
    nodeval = { RemoteRef(p) for p=P }
    answer = RemoteRef()
    np = numel(P)

    for i=0:np-1
        @spawnat P[i+1] begin
            stride=1
            tally = sum(localpart(v))
            while stride < np
                if i%(2*stride) == 0
                    tally = tally + take(nodeval[i+stride])
                    stride = 2*stride
                else
                    put(nodeval[i], tally)
                    break
                end
            end
            if i==0
                put(answer, tally)
            end
        end
    end
    fetch(answer)
end

function reduce(f, v::DArray)
    mapreduce(fetch, f,
              { @spawnat p reduce(f,localpart(v)) for p = procs(v) })
end

# possibly-useful abstraction:

type RefGroup
    refs::Array{RemoteRef,1}

    RefGroup(P) = new([ RemoteRef(p) for p=P ])
end

getindex(r::RefGroup, i) = fetch(r.refs[i])
setindex!(r::RefGroup, v, i) = put(r.refs[i], v)

time.jl

module Time
export TimeDelta

import Base.show, Base.+, Base.-, Base.convert, Base.promote_rule

immutable TimeDelta{p}
    v::Int64
end

const PREFIXES = [
    "yocto", "zepto", "atto", "femto", "pico", "nano", "micro", "milli",
    "", "kilo", "mega", "giga", "tera", "peta", "exa", "zetta", "yotta",
]
const ZERO_INDEX = 9
const MAX_INDEX = 17

function show{p}(io::IO, x::TimeDelta{p})
    k = max(1,min(MAX_INDEX,fld(p,3)+ZERO_INDEX))
    r = p-3(k-ZERO_INDEX)
    prefix = PREFIXES[k]
    if r == 0
        s = x.v == 1 ? "" : "s"
        print(io, "$(x.v) $(prefix)second$s")
    elseif r > 0
        print(io, "$(x.v*10^r) $(prefix)seconds")
    else
        print(io, "$(x.v/10^-r) $(prefix)seconds")
    end
end

convert{p,q}(::Type{TimeDelta{p}}, x::TimeDelta{q}) =
    TimeDelta{p}(p <= q ? x.v*10^(q-p) : div(x.v,10^(p-q)))

promote_rule{p,q}(::Type{TimeDelta{p}}, ::Type{TimeDelta{q}}) = TimeDelta{min(p,q)}

-{p}(x::TimeDelta{p}) = TimeDelta{p}(-x.v)
+{p}(x::TimeDelta{p}, y::TimeDelta{p}) = TimeDelta{p}(x.v+y.v)
-{p}(x::TimeDelta{p}, y::TimeDelta{p}) = TimeDelta{p}(x.v-y.v)

+(x::TimeDelta, y::TimeDelta) = +(promote(x,y)...)
-(x::TimeDelta, y::TimeDelta) = -(promote(x,y)...)

end # module

ndgrid.jl

ndgrid(v::AbstractVector) = copy(v)

function ndgrid{T}(v1::AbstractVector{T}, v2::AbstractVector{T})
    m, n = length(v1), length(v2)
    v1 = reshape(v1, m, 1)
    v2 = reshape(v2, 1, n)
    (repmat(v1, 1, n), repmat(v2, m, 1))
end

function ndgrid_fill(a, v, s, snext)
    for j = 1:length(a)
        a[j] = v[div(rem(j-1, snext), s)+1]
    end
end

function ndgrid{T}(vs::AbstractVector{T}...)
    n = length(vs)
    sz = map(length, vs)
    out = ntuple(n, i->Array(T, sz))
    s = 1
    for i=1:n
        a = out[i]::Array
        v = vs[i]
        snext = s*size(a,i)
        ndgrid_fill(a, v, s, snext)
        s = snext
    end
    out
end

meshgrid(v::AbstractVector) = meshgrid(v, v)

function meshgrid{T}(vx::AbstractVector{T}, vy::AbstractVector{T})
    m, n = length(vy), length(vx)
    vx = reshape(vx, 1, n)
    vy = reshape(vy, m, 1)
    (repmat(vx, m, 1), repmat(vy, 1, n))
end

function meshgrid{T}(vx::AbstractVector{T}, vy::AbstractVector{T},
                     vz::AbstractVector{T})
    m, n, o = length(vy), length(vx), length(vz)
    vx = reshape(vx, 1, n, 1)
    vy = reshape(vy, m, 1, 1)
    vz = reshape(vz, 1, 1, o)
    om = ones(Int, m)
    on = ones(Int, n)
    oo = ones(Int, o)
    (vx[om, :, oo], vy[:, on, oo], vz[om, on, :])
end

lru_test.jl

using LRUExample

TestLRU = LRUExample.UnboundedLRU{ASCIIString, ASCIIString}()
TestBLRU = LRUExample.BoundedLRU{ASCIIString, ASCIIString}(1000)

get_str(i) = ascii(vcat(map(x->[x>>4; x&0x0F], reinterpret(Uint8, [int32(i)]))...))

isbounded{L<:LRUExample.LRU}(::Type{L}) = any(map(n->n==:maxsize, L.names))
isbounded{L<:LRUExample.LRU}(l::L) = isbounded(L)

nmax = int(logspace(2, 5, 4))

function lrutest()
    #println("LRU consistency tests")
    for lru in (TestLRU,TestBLRU)
        for n in nmax
            empty!(lru)
            #@printf("  %s, %d items\n", lru, n)
            #print("    Simple eviction: ")
            for i in 1:n
                str = get_str(i)
                lru[str] = str
                @assert lru.q[1].v == str
                if isbounded(lru) && length(lru) >= lru.maxsize
                    tailstr = get_str(i-lru.maxsize+1)
                    @assert lru.q[end].v == tailstr
                end
            end
            #println("pass")

            #print("    Lookup, random access: ")
            for i in 1:n
                str = get_str(rand(1:n))
                if haskey(lru, str) # the bounded LRUs can have cache misses
                    blah = lru[str]
                    @assert lru.q[1].v == blah
                end
            end
            #println("pass")
        end
        empty!(lru)
    end
end

lrutest()

staged.jl

function add_method(gf, an, at, body)
    argexs = { Expr(symbol("::"), an[i], at[i]) for i=1:length(an) }
    def = quote
        let __F__=($gf)
            function __F__($(argexs...))
                $body
            end
        end
    end
    eval(def)
end

macro staged(fdef)
    if !isa(fdef,Expr) || !is(fdef.head,:function)
        error("@staged: expected method definition")
    end
    fname = fdef.args[1].args[1]
    argspec = fdef.args[1].args[2:end]
    argnames = map(x->(isa(x,Expr) ? x.args[1] : x), argspec)
    qargnames = map(x->Expr(:quote,x), argnames)
    fbody = fdef.args[2]
    @gensym gengf argtypes expander genbody
    quote
        let ($gengf)
            global ($fname)   # should be "outer"
            local ($expander)
            function ($expander)($(argnames...))
                $fbody
            end
            ($gengf)() = 0    # should be initially empty GF
            function ($fname)($(argspec...))
                ($argtypes) = typeof(tuple($(argnames...)))
                if !method_exists($gengf, $argtypes)
                    ($genbody) = apply(($expander), ($argtypes))
                    add_method($gengf, {$(qargnames...)},
                                   $argtypes, $genbody)
                end
                return ($gengf)($(argnames...))
            end
        end
    end
end

# example

@staged function nloops(dims::Tuple)
    names = map(x->gensym(), dims)
    ex = quote
        println([$(names...)])
    end
    for i = 1:length(dims)
        ex = quote
            for $(names[i]) in dims[$i]
                $ex
            end
        end
    end
    ex
end

plife.jl

function life_rule(old)
    m, n = size(old)
    new = similar(old, m-2, n-2)
    for j = 2:n-1
        for i = 2:m-1
            nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1],
                   old[i  ,j-1],             old[i  ,j+1],
                   old[i+1,j-1], old[i+1,j], old[i+1,j+1])

            new[i-1,j-1] = (nc == 3 ? 1 :
                            nc == 2 ? old[i,j] :
                            0)
        end
    end
    new
end

function life_step(d)
    DArray(size(d),procs(d)) do I
        # fetch neighborhood - toroidal boundaries
        top   = mod(first(I[1])-2,size(d,1))+1
        bot   = mod( last(I[1])  ,size(d,1))+1
        left  = mod(first(I[2])-2,size(d,2))+1
        right = mod( last(I[2])  ,size(d,2))+1

        old = Array(Bool, length(I[1])+2, length(I[2])+2)
        @sync begin
            @async old[1      , 1      ] = d[top , left]   # left side
            @async old[2:end-1, 1      ] = d[I[1], left]
            @async old[end    , 1      ] = d[bot , left]
            @async old[1      , 2:end-1] = d[top , I[2]]
            @async old[2:end-1, 2:end-1] = d[I[1], I[2]]   # middle
            @async old[end    , 2:end-1] = d[bot , I[2]]
            @async old[1      , end    ] = d[top , right]  # right side
            @async old[2:end-1, end    ] = d[I[1], right]
            @async old[end    , end    ] = d[bot , right]
        end

        life_rule(old)
    end
end

function plife(m, n)
    w = Window("parallel life", n, m)
    c = Canvas(w)
    pack(c)
    done = false
    c.mouse.button1press = (c,x,y)->(done=true)
    cr = getgc(c)

    grid = DArray(I->convert(Array{Bool,2},randbool(map(length,I))), (m, n), workers())

    last = time(); f = 1
    while !done
        @async begin
            img = convert(Array{Uint32,2},grid) .* 0x00ffffff
            set_source_surface(cr, CairoRGBSurface(img), 0, 0)
            paint(cr)
            reveal(c)
        end
        t = time()
        if t-last > 2
            println("$(f/(t-last)) FPS")
            last = t; f = 0
        end
        grid = life_step(grid)
        f += 1
        sleep(0.01)
    end
end

quaternion.jl

module Quaternions

import Base: convert, promote_rule, show, real, imag, conj, abs, abs2, inv, +, -, /, *

immutable Quaternion{T<:Real} <: Number
    q0::T
    q1::T
    q2::T
    q3::T
end

Quaternion(q0::Real,q1::Real,q2::Real,q3::Real) = Quaternion(promote(q0,q1,q2,q3)...)

convert{T}(::Type{Quaternion{T}}, x::Real) =
    Quaternion(convert(T,x), zero(T), zero(T), zero(T))
convert{T}(::Type{Quaternion{T}}, z::Complex) =
    Quaternion(convert(T,real(z)), convert(T,imag(z)), zero(T), zero(T))
convert{T}(::Type{Quaternion{T}}, z::Quaternion) =
    Quaternion(convert(T,z.q0), convert(T,z.q1), convert(T,z.q2), convert(T,z.q3))

promote_rule{s,S}(::Type{MathConst{s}}, ::Type{Quaternion{S}}) = Quaternion{S}
promote_rule{T,S}(::Type{Complex{T}}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)}
promote_rule{S}(::Type{Bool}, ::Type{Quaternion{S}}) = Quaternion{S}
promote_rule{T<:Real,S}(::Type{T}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)}
promote_rule{T,S}(::Type{Quaternion{T}}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)}

function show(io::IO, z::Quaternion)
    pm(x) = x < 0 ? " - $(-x)" : " + $x"
    print(io, z.q0, pm(z.q1), "i", pm(z.q2), "j", pm(z.q3), "k")
end

real(z::Quaternion) = z.q0
imag(z::Quaternion) = z.q1

conj(z::Quaternion) = Quaternion(z.q0, -z.q1, -z.q2, -z.q3)
abs(z::Quaternion) = sqrt(z.q0*z.q0 + z.q1*z.q1 + z.q2*z.q2 + z.q3*z.q3)
abs2(z::Quaternion) = z.q0*z.q0 + z.q1*z.q1 + z.q2*z.q2 + z.q3*z.q3
inv(z::Quaternion) = conj(z)/abs2(z)

(-)(z::Quaternion) = Quaternion(-z.q0, -z.q1, -z.q2, -z.q3)

(/)(z::Quaternion, x::Real) = Quaternion(z.q0/x, z.q1/x, z.q2/x, z.q3/x)

(+)(z::Quaternion, w::Quaternion) = Quaternion(z.q0 + w.q0, z.q1 + w.q1,
                                               z.q2 + w.q2, z.q3 + w.q3)
(-)(z::Quaternion, w::Quaternion) = Quaternion(z.q0 - w.q0, z.q1 - w.q1,
                                               z.q2 - w.q2, z.q3 - w.q3)
(*)(z::Quaternion, w::Quaternion) = Quaternion(z.q0*w.q0 - z.q1*w.q1 - z.q2*w.q2 - z.q3*w.q3,
                                               z.q0*w.q1 + z.q1*w.q0 + z.q2*w.q3 - z.q3*w.q2,
                                               z.q0*w.q2 - z.q1*w.q3 + z.q2*w.q0 + z.q3*w.q1,
                                               z.q0*w.q3 + z.q1*w.q2 - z.q2*w.q1 + z.q3*w.q0)
(/)(z::Quaternion, w::Quaternion) = z*inv(w)

end # module

wordcount.jl

# wordcount.jl
#   
# Implementation of parallelized "word-count" of a text, inspired by the 
# Hadoop WordCount example. Uses @spawn and fetch() to parallelize 
# the "map" task. Reduce is currently done single-threaded.
#
# To run in parallel on a string stored in variable `text`:
#  julia -p <N> 
#  julia> require("<julia_dir>/examples/wordcount.jl")
#  julia> ...(define text)...
#  julia> counts=parallel_wordcount(text)
#
# Or to run on a group of files, writing results to an output file:
#  julia -p <N>
#  julia> require("<julia_dir>/examples/wordcount.jl")
#  julia> wordcount_files("/tmp/output.txt", "/tmp/input1.txt","/tmp/input2.txt",...) 

# "Map" function.
# Takes a string. Returns a Dict with the number of times each word 
# appears in that string.
function wordcount(text)
    words=split(text,[' ','\n','\t','-','.',',',':',';'],false)
    counts=Dict()
    for w = words
        counts[w]=get(counts,w,0)+1
    end
    return counts
end

# "Reduce" function.
# Takes a collection of Dicts in the format returned by wordcount()
# Returns a Dict in which words that appear in multiple inputs
# have their totals added together.
function wcreduce(wcs)
    counts=Dict()
    for c in wcs, (k,v) in c
        counts[k] = get(counts,k,0)+v
    end
    return counts
end

# Splits input string into nprocs() equal-sized chunks (last one rounds up), 
# and @spawns wordcount() for each chunk to run in parallel. Then fetch()s
# results and performs wcreduce().
function parallel_wordcount(text)
    lines=split(text,'\n',false)
    np=nprocs()
    unitsize=ceil(length(lines)/np)
    wcounts={}
    rrefs={}
    # spawn procs
    for i=1:np
        first=unitsize*(i-1)+1
        last=unitsize*i
        if last>length(lines)
            last=length(lines)
        end
        subtext=join(lines[int(first):int(last)],"\n")
        push!(rrefs, @spawn wordcount( subtext ) )
    end
    # fetch results
    while length(rrefs)>0
        push!(wcounts,fetch(pop!(rrefs)))
    end
    # reduce
    count=wcreduce(wcounts)
    return count
end

# Takes the name of a result file, and a list of input file names.
# Combines the contents of all files, then performs a parallel_wordcount
# on the resulting string. Writes the results to result_file.
function wordcount_files(result_file,inputs...)
    text = ""
    for file in inputs
        open(file) do f
            text *= readall(f)
        end
    end
    wc = parallel_wordcount(text)
    open(result_file,"w") do f
        for (k,v) in wc
            println(f, k,"=",v)
        end
    end
end

lru.jl

module LRUExample
# An LRU (Least Recently Used) cache is an associative data structure which
# maintains its contents in an order such that the most recently used item
# is at the beginning of the structure, and the least recently used at the end.
#
# This file specifies two types of LRU caches, both with and without a size
# limit. BoundedLRU has a limit and evicts the LRU item if a new item is added
# after that bound is reached. UnboundedLRU does not have a maximum size, but
# can be used as a basis for more complex LRUs.
#
# LRUs should follow the interfaces for general collections, indexable
# collections, and associative collections.

# The standard implementation of an LRU backs a hash table with a doubly-linked
# list for O(1) operations when reordering on access and eviction. The Julia
# implementation instead backs the table with a Vector. For moderately-sized
# collections, the difference in performance is small, and this implmentation
# is simpler and easier to understand.

import Base.isempty, Base.length, Base.sizeof
import Base.start, Base.next, Base.done
import Base.haskey, Base.get
import Base.setindex!, Base.getindex, Base.delete!, Base.empty!
import Base.show

abstract LRU{K,V} <: Associative{K,V}

# Default cache size
const __MAXCACHE = 1024

type CacheItem{K,V}
    k::K
    v::V
end

type UnboundedLRU{K,V} <: LRU{K,V}
    ht::Dict
    q::Vector{CacheItem}

    UnboundedLRU() = new(Dict(), similar(Array(CacheItem,1), 0))
end
UnboundedLRU() = UnboundedLRU{Any, Any}()

type BoundedLRU{K,V} <: LRU{K,V}
    ht::Dict
    q::Vector{CacheItem}
    maxsize::Int

    BoundedLRU(m) = new(Dict(), similar(Array(CacheItem,1), 0), m)
    BoundedLRU() = BoundedLRU(__MAXCACHE)
end
BoundedLRU(m) = BoundedLRU{Any, Any}(m)
BoundedLRU() = BoundedLRU{Any, Any}()

## collections ##

isempty(lru::LRU) = isempty(lru.q)
length(lru::LRU) = length(lru.q)
haskey(lru::LRU, key) = haskey(lru.ht, key)

## associative ##

# Should this check count as an access?
haskey(lru::LRU, key) = haskey(lru.ht, key)

get(lru::LRU, key, default) = haskey(lru, key) ? lru[key] : default

function empty!(lru::LRU)
    empty!(lru.ht)
    empty!(lru.q)
end


show(io::IO, lru::UnboundedLRU) = print(io,"UnboundedLRU()")
show(io::IO, lru::BoundedLRU) = print(io,"BoundedLRU($(lru.maxsize))")

## indexable ##

# Method to do the second, slow lookup in the list with early return.
function locate(q, x)
    for i = 1:length(q)
        if q[i] == x
            return i
        end
    end
    error("Item not found.")
end

function getindex(lru::LRU, key)
    item = lru.ht[key]
    idx = locate(lru.q, item)
    splice!(lru.q, idx)
    unshift!(lru.q, item)
    item.v
end

function setindex!(lru::LRU, v, key)
    if haskey(lru, key)
        item = lru.ht[key]
        idx = locate(lru.q, item)
        item.v = v
        splice!(lru.q, idx)
    else
        item = CacheItem(key, v)
        lru.ht[key] = item
    end
    unshift!(lru.q, item)
end

# Eviction
function setindex!{V,K}(lru::BoundedLRU, v::V, key::K)
    invoke(setindex!, (LRU, V, K), lru, v, key)
    nrm = length(lru) - lru.maxsize
    for i in 1:nrm
        rm = pop!(lru.q)
        delete!(lru.ht, rm.k)
    end
end

## associative ##

function delete!(lru::LRU, key)
    item = lru.ht[key]
    idx = locate(lru.q, item)
    delete!(lru.ht, key)
    delete!(lru.q, idx)
end

end # module

typetree.jl

module TypeTrees
##
# Generate a text graphic of Julia modules type tree
##

# The node type holds the type of the cuurent node and a dict of subtypes
type TTNode
    strname::String
    typ::Type
    subtypes::Dict{String, TTNode}

    TTNode(sname::String, t::Type) = new(sname, t, Dict{String, TTNode}())
end

# Add a node to a dict if not added
function add_ttnode(subtypes::Dict{String, TTNode}, sname::String, tnode::TTNode)
    ret = get(subtypes, sname, nothing)
    (nothing == ret) && (ret = subtypes[sname] = tnode)
    ret
end

function add_ttnode(subtypes::Dict{String, TTNode}, sname::String, t::Type)
    ret = get(subtypes, sname, nothing)
    (nothing == ret) && (subtypes[sname] = ret = TTNode(sname, t))
    ret
end

# Get a string name for the type
typ_name(t::UnionType) = string(t)
typ_name(t::TypeConstructor) = string(t)
typ_name(t) = string(t.name)

# Store a type and its type hierarchy chain
# Recurse till we reach the top level type
function store_type(sname::String, t::UnionType)
    suptype = UnionType
    tnode = TTNode(sname, t)

    # store unions under UnionType type
    subtypes = store_type(typ_name(suptype), suptype)
    add_ttnode(subtypes, sname, tnode)

    # unions are also in a sense related to the types of their components
    for suptype = t.types
        subtypes = store_type(typ_name(suptype), suptype)
        add_ttnode(subtypes, sname, tnode)
    end

    return tnode.subtypes
end

function store_type(sname::String, t::TypeConstructor)
    suptype = t.body
    subtypes = store_type(typ_name(suptype), suptype)
    tnode = add_ttnode(subtypes, sname, t)
    return tnode.subtypes
end

function store_type(sname::String, t::DataType)
    suptype = super(t)
    subtypes = (suptype != t) ? store_type(typ_name(suptype), suptype) : types_tree
    tnode = add_ttnode(subtypes, sname, t)
    return tnode.subtypes
end

function store_type(sname::String, t::Tuple)
    tnode = add_ttnode(types_tree, sname, t)
    return tnode.subtypes
end

function store_type(sname::String, t)
    suptype = super(t)
    subtypes = (suptype != t) ? store_type(typ_name(suptype), suptype) : types_tree
    tnode = add_ttnode(subtypes, sname, t)
    return tnode.subtypes
end

# examine all symbols in module and store those that are types
function store_all_from(m::Module)
    for expr = names(m,true)
        try
            t = eval(m,expr)
            isa(t, Type) && store_type(string(expr), t)
        #catch ex
        #    println("Error adding ", string(expr), m, " (", ex, ")")
        end
    end
end

type_props(typ) = ""
type_props(typ::DataType) = string("<<", 
                                 typ.abstract    ? " abstract"    : " concrete", 
                                 typ.mutable     ? " mutable"     : " immutable",
                                 typ.pointerfree ? " pointerfree" : "",
                                 " size:", typ.size,
                                 " >>")

function print_tree(subtypes::Dict{String, TTNode}, pfx::String="")
    for n in sort!([keys(subtypes)...])
        v = subtypes[n]
        if(n == string(v.typ))
            println(pfx, "+- ", n, " ", type_props(v.typ))
        else
            println(pfx, "+- ", n, " = ",  v.typ, " ", type_props(v.typ))
        end
        print_tree(v.subtypes, pfx * ".  ")
    end
end


# TODO: optionally take module names in command line 
# TODO: sort output
# TODO: option to list subtrees of type tree, or other symbol types
const types_tree = Dict{String, TTNode}()

for m in (Base, Core, Main)
    store_all_from(m)
end

# print_tree(types_tree)

end # module

hpl.jl

## Based on "Multi-Threading and One-Sided Communication in Parallel LU Factorization"
## http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.138.4361&rank=7

function hpl_seq(A::Matrix, b::Vector)

    blocksize = 5

    n = size(A,1)
    A = [A b]

    B_rows = linspace(0, n, div(n,blocksize)+1)
    B_rows[end] = n 
    B_cols = [B_rows, [n+1]]
    nB = length(B_rows)
    depend = zeros(Bool, nB, nB) # In parallel, depend needs to be able to hold futures

    ## Small matrix case
    if nB <= 1
        x = A[1:n, 1:n] \ A[:,n+1]
        return x
    end

    ## Add a ghost row of dependencies to boostrap the computation
    for j=1:nB; depend[1,j] = true; end

    for i=1:(nB-1)
        ## Threads for panel factorizations
        I = (B_rows[i]+1):B_rows[i+1]
        #(depend[i+1,i], panel_p) = spawn(panel_factor_seq, I, depend[i,i])
        (depend[i+1,i], panel_p) = panel_factor_seq(A, I, depend[i,i])

        ## Threads for trailing updates
        for j=(i+1):nB
            J = (B_cols[j]+1):B_cols[j+1]    
            #depend[i+1,j] = spawn(trailing_update_seq, I, J, panel_p, depend[i+1,i],depend[i,j])
            depend[i+1,j] = trailing_update_seq(A, I, J, panel_p, depend[i+1,i],depend[i,j])
        end
    end

    ## Completion of the last diagonal block signals termination
    #wait(depend[nB, nB])

    ## Solve the triangular system
    x = triu(A[1:n,1:n]) \ A[:,n+1]

    return x

end ## hpl()


### Panel factorization ###

function panel_factor_seq(A, I, col_dep)
    n = size (A, 1)

    ## Enforce dependencies
    #wait(col_dep)

    ## Factorize a panel
    K = I[1]:n
    panel_p = lufact!(sub(A, K, I))[:p] # Economy mode

    ## Panel permutation 
    panel_p = K[panel_p]

    return (true, panel_p)

end ## panel_factor_seq()


### Trailing update ###

function trailing_update_seq(A, I, J, panel_p, row_dep, col_dep)

    n = size (A, 1)

    ## Enforce dependencies
    #wait(row_dep, col_dep)

    ## Apply permutation from pivoting 
    K = (I[end]+1):n       
    A[I[1]:n, J] = A[panel_p, J]

    ## Compute blocks of U 
    L = tril(A[I,I],-1) + eye(length(I))
    A[I, J] = L \ A[I, J]

    ## Trailing submatrix update
    if !isempty(K)
        A[K,J] = A[K,J] - A[K,I]*A[I,J]  
    end

    return true

end ## trailing_update_seq()

# This version is written for a shared memory implementation.
# The matrix A is local to the first Worker, which allocates work to other Workers
# All updates to A are carried out by the first Worker. Thus A is not distributed

hpl_par(A::Matrix, b::Vector) = hpl_par(A, b, max(1, div(max(size(A)),4)), true)

hpl_par(A::Matrix, b::Vector, bsize::Integer) = hpl_par(A, b, bsize, true)

function hpl_par(A::Matrix, b::Vector, blocksize::Integer, run_parallel::Bool)

    n = size(A,1)
    A = [A b]

    if blocksize < 1
       throw(ArgumentError("hpl_par: invalid blocksize: $blocksize < 1"))
    end

    B_rows = linspace(0, n, div(n,blocksize)+1)
    B_rows[end] = n 
    B_cols = [B_rows, [n+1]]
    nB = length(B_rows)
    depend = cell(nB, nB)

    ## Small matrix case
    if nB <= 1
        x = A[1:n, 1:n] \ A[:,n+1]
        return x
    end

    ## Add a ghost row of dependencies to boostrap the computation
    for j=1:nB; depend[1,j] = true; end
    for i=2:nB, j=1:nB; depend[i,j] = false; end

    for i=1:(nB-1)
        #println("A=$A") #####
        ## Threads for panel factorizations
        I = (B_rows[i]+1):B_rows[i+1]
        K = I[1]:n
        (A_KI, panel_p) = panel_factor_par(A[K,I], depend[i,i])

        ## Write the factorized panel back to A
        A[K,I] = A_KI

        ## Panel permutation 
        panel_p = K[panel_p]
        depend[i+1,i] = true

        ## Apply permutation from pivoting
        J = (B_cols[i+1]+1):B_cols[nB+1]
        A[K, J] = A[panel_p, J]
        ## Threads for trailing updates
        #L_II = tril(A[I,I], -1) + eye(length(I))
        L_II = tril(sub(A,I,I), -1) + eye(length(I))
        K = (I[length(I)]+1):n
        A_KI = A[K,I]

        for j=(i+1):nB
            J = (B_cols[j]+1):B_cols[j+1]

            ## Do the trailing update (Compute U, and DGEMM - all flops are here)
            if run_parallel
                A_IJ = A[I,J]
                #A_KI = A[K,I]
                A_KJ = A[K,J]
                depend[i+1,j] = @spawn trailing_update_par(L_II, A_IJ, A_KI, A_KJ, depend[i+1,i], depend[i,j])
            else
                depend[i+1,j] = trailing_update_par(L_II, A[I,J], A[K,I], A[K,J], depend[i+1,i], depend[i,j])
            end
        end

        # Wait for all trailing updates to complete, and write back to A
        for j=(i+1):nB
            J = (B_cols[j]+1):B_cols[j+1]
            if run_parallel
                (A_IJ, A_KJ) = fetch(depend[i+1,j])
            else
                (A_IJ, A_KJ) = depend[i+1,j]
            end
            A[I,J] = A_IJ
            A[K,J] = A_KJ
            depend[i+1,j] = true
        end

    end

    ## Completion of the last diagonal block signals termination
    @assert depend[nB, nB]

    ## Solve the triangular system
    x = triu(A[1:n,1:n]) \ A[:,n+1]

    return x

end ## hpl()


### Panel factorization ###

function panel_factor_par(A_KI, col_dep)

    @assert col_dep

    ## Factorize a panel
    panel_p = lufact!(A_KI)[:p] # Economy mode

    return (A_KI, panel_p)

end ## panel_factor_par()


### Trailing update ###

function trailing_update_par(L_II, A_IJ, A_KI, A_KJ, row_dep, col_dep)

    @assert row_dep
    @assert col_dep

    ## Compute blocks of U 
    A_IJ = L_II \ A_IJ

    ## Trailing submatrix update - All flops are here
    if !isempty(A_KJ)
        m, k = size(A_KI)
        n = size(A_IJ,2)
        blas_gemm('N','N',m,n,k,-1.0,A_KI,m,A_IJ,k,1.0,A_KJ,m)
        #A_KJ = A_KJ - A_KI*A_IJ
    end

    return (A_IJ, A_KJ)

end ## trailing_update_par()


### using DArrays ###

function hpl_par2(A::Matrix, b::Vector)
    n = size(A,1)
    A = [A b]

    C = distribute(A, 2)
    nB = length(C.pmap)

    ## case if only one processor
    if nB <= 1
        x = A[1:n, 1:n] \ A[:,n+1]
        return x
    end

    depend = Array(RemoteRef, nB, nB)

    #pmap[i] is where block i's stuff is
    #block i is dist[i] to dist[i+1]-1
    for i = 1:nB
        #println("C=$(convert(Array, C))") #####
        ##panel factorization
        panel_p = remotecall_fetch(C.pmap[i], panel_factor_par2, C, i, n)

        ## Apply permutation from pivoting
        for j = (i+1):nB
           depend[i,j] = remotecall(C.pmap[j], permute, C, i, j, panel_p, n, false)
        end
        ## Special case for last column
        if i == nB
           depend[nB,nB] = remotecall(C.pmap[nB], permute, C, i, nB+1, panel_p, n, true)
        end

        ##Trailing updates
        (i == nB) ? (I = (C.dist[i]):n) :
                    (I = (C.dist[i]):(C.dist[i+1]-1))
        C_II = C[I,I]
        L_II = tril(C_II, -1) + eye(length(I))
        K = (I[length(I)]+1):n
        if length(K) > 0
            C_KI = C[K,I]
        else
            C_KI = zeros(0)
        end

        for j=(i+1):nB
            dep = depend[i,j]
            depend[j,i] = remotecall(C.pmap[j], trailing_update_par2, C, L_II, C_KI, i, j, n, false, dep)
        end

        ## Special case for last column
        if i == nB
            dep = depend[nB,nB]
            remotecall_fetch(C.pmap[nB], trailing_update_par2, C, L_II, C_KI, i, nB+1, n, true, dep)
        else
            #enforce dependencies for nonspecial case
            for j=(i+1):nB
                wait(depend[j,i])
            end
        end
    end

    A = convert(Array, C)
    x = triu(A[1:n,1:n]) \ A[:,n+1]
end ## hpl_par2()

function panel_factor_par2(C, i, n)
    (C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) :
                           (I = (C.dist[i]):(C.dist[i+1]-1))
    K = I[1]:n
    C_KI = C[K,I]
    #(C_KI, panel_p) = lu!(C_KI) #economy mode
    panel_p = lu!(C_KI)[2]
    C[K,I] = C_KI

    return panel_p
end ##panel_factor_par2()

function permute(C, i, j, panel_p, n, flag)
    if flag
        K = (C.dist[i]):n
        J = (n+1):(n+1)
        C_KJ = C[K,J]

        C_KJ = C_KJ[panel_p,:]
        C[K,J] = C_KJ
    else
        K = (C.dist[i]):n
        J = (C.dist[j]):(C.dist[j+1]-1)
        C_KJ = C[K,J]

        C_KJ = C_KJ[panel_p,:]
        C[K,J] = C_KJ
    end
end ##permute()

function trailing_update_par2(C, L_II, C_KI, i, j, n, flag, dep)
    if isa(dep, RemoteRef); wait(dep); end
    if flag
        #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) :
        #                       (I = (C.dist[i]):(C.dist[i+1]-1))
        I = C.dist[i]:n
        J = (n+1):(n+1)
        K = (I[length(I)]+1):n
        C_IJ = C[I,J]
        if length(K) > 0
            C_KJ = C[K,J]
        else
            C_KJ = zeros(0)
        end
        ## Compute blocks of U
        C_IJ = L_II \ C_IJ
        C[I,J] = C_IJ
    else
        #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) :
        #                       (I = (C.dist[i]):(C.dist[i+1]-1))

        I = (C.dist[i]):(C.dist[i+1]-1)
        J = (C.dist[j]):(C.dist[j+1]-1)
        K = (I[length(I)]+1):n
        C_IJ = C[I,J]
        if length(K) > 0
            C_KJ = C[K,J]
        else
            C_KJ = zeros(0)
        end

        ## Compute blocks of U
        C_IJ = L_II \ C_IJ
        C[I,J] = C_IJ
        ## Trailing submatrix update - All flops are here
        if !isempty(C_KJ)
            cm, ck = size(C_KI)
            cn = size(C_IJ,2)
            blas_gemm('N','N',cm,cn,ck,-1.0,C_KI,cm,C_IJ,ck,1.0,C_KJ,cm)
            #C_KJ = C_KJ - C_KI*C_IJ
            C[K,J] = C_KJ
        end   
    end 
end ## trailing_update_par2()

## Test n*n matrix on np processors
## Prints 5 numbers that should be close to zero
function test(n, np)
    A = rand(n,n); b = rand(n);
    X = (@elapsed x = A \ b);
    Y = (@elapsed y = hpl_par(A,b, max(1,div(n,np))));
    Z = (@elapsed z = hpl_par2(A,b));
    for i=1:(min(5,n))
        print(z[i]-y[i], " ")
    end
    println()
    return (X,Y,Z)
end

## test k times and collect average
function test(n,np,k)
    sum1 = 0; sum2 = 0; sum3 = 0;
    for i = 1:k
        (X,Y,Z) = test(n,np)
        sum1 += X
        sum2 += Y
        sum3 += Z
    end
    return (sum1/k, sum2/k, sum3/k)
end