Julia is a high-level, high-performance dynamic programming language for technical computing. This site is a non official series of examples of Julia, for more details see about.
This script prints a string identical to it's own source code, see here.
github.com/karbarcca/Rosetta-Julia/blob/master/src/Quine.jl: In Julia, $x in a string literal interpolates the value of the variable into the string. $(expression) evaluates the expression and interpolates the result into the string. Normally, the string value "hi\tworld" would be inserted without quotation marks and with a literal tab
The repr function returns a string value that contains quotation marks and in which the literal tab is replaced by the characters \t. When the result of the repr function is interpolated, the result is what you would type into your code to create that string literal.
x="println(\"x=\$(repr(x))\\n\$x\")" println("x=$(repr(x))\n$x")
import Base.Sort immutable BubbleSortAlg <: Sort.Algorithm end const BubbleSort = BubbleSortAlg() function Base.sort!(v::AbstractVector, lo::Int, hi::Int, ::BubbleSortAlg, o::Sort.Ordering) while true clean = true for i = lo:hi-1 if Sort.lt(o, v[i+1], v[i]) v[i+1], v[i] = v[i], v[i+1] clean = false end end clean && break end return v end
Some description here.
macro enum(T,syms...) blk = quote immutable $(esc(T)) n::Int32 $(esc(T))(n::Integer) = new(n) end Base.show(io::IO, x::$(esc(T))) = print(io, $syms[x.n+1]) Base.show(io::IO, x::Type{$(esc(T))}) = print(io, $(string("enum ", T, ' ', '(', join(syms, ", "), ')'))) end for (i,sym) in enumerate(syms) push!(blk.args, :(const $(esc(sym)) = $(esc(T))($(i-1)))) end push!(blk.args, :nothing) blk.head = :toplevel return blk end
module ModInts importall Base immutable ModInt{n} <: Integer k::Int ModInt(k) = new(mod(k,n)) end -{n}(a::ModInt{n}) = ModInt{n}(-a.k) +{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k+b.k) -{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k-b.k) *{n}(a::ModInt{n}, b::ModInt{n}) = ModInt{n}(a.k*b.k) convert{n}(::Type{ModInt{n}}, i::Int) = ModInt{n}(i) promote_rule{n}(::Type{ModInt{n}}, ::Type{Int}) = ModInt{n} show{n}(io::IO, k::ModInt{n}) = print(io, "$(k.k) mod $n") showcompact(io::IO, k::ModInt) = print(io, k.k) end # module
Example of the 8 Queens Puzzle.
addqueen(queens::Array{Vector{Int}}, queen::Vector{Int}) = push!(copy(queens), queen) hitsany(queen::Vector{Int}, queens::Array{Vector{Int}}) = any(map(x->hits(queen, x), queens)) hits(a::Array{Int}, b::Array{Int}) = any(a .== b) || abs(a-b)[1] == abs(a-b)[2] function solve(x, y, n, d=Array(Vector{Int}, 0)) if n == 0 return d end for px = 1:x for py = 1:y if !hitsany([px, py], d) s = solve(x, y, n-1, addqueen(d, [px, py])) s != nothing && return s end end end return nothing end
importall Base # figure 5.2 from principles of parallel programming, ported to julia. # sum a vector using a tree on top of local reductions. function sum(v::DArray) P = procs(v) nodeval = { RemoteRef(p) for p=P } answer = RemoteRef() np = numel(P) for i=0:np-1 @spawnat P[i+1] begin stride=1 tally = sum(localpart(v)) while stride < np if i%(2*stride) == 0 tally = tally + take(nodeval[i+stride]) stride = 2*stride else put(nodeval[i], tally) break end end if i==0 put(answer, tally) end end end fetch(answer) end function reduce(f, v::DArray) mapreduce(fetch, f, { @spawnat p reduce(f,localpart(v)) for p = procs(v) }) end # possibly-useful abstraction: type RefGroup refs::Array{RemoteRef,1} RefGroup(P) = new([ RemoteRef(p) for p=P ]) end getindex(r::RefGroup, i) = fetch(r.refs[i]) setindex!(r::RefGroup, v, i) = put(r.refs[i], v)
module Time export TimeDelta import Base.show, Base.+, Base.-, Base.convert, Base.promote_rule immutable TimeDelta{p} v::Int64 end const PREFIXES = [ "yocto", "zepto", "atto", "femto", "pico", "nano", "micro", "milli", "", "kilo", "mega", "giga", "tera", "peta", "exa", "zetta", "yotta", ] const ZERO_INDEX = 9 const MAX_INDEX = 17 function show{p}(io::IO, x::TimeDelta{p}) k = max(1,min(MAX_INDEX,fld(p,3)+ZERO_INDEX)) r = p-3(k-ZERO_INDEX) prefix = PREFIXES[k] if r == 0 s = x.v == 1 ? "" : "s" print(io, "$(x.v) $(prefix)second$s") elseif r > 0 print(io, "$(x.v*10^r) $(prefix)seconds") else print(io, "$(x.v/10^-r) $(prefix)seconds") end end convert{p,q}(::Type{TimeDelta{p}}, x::TimeDelta{q}) = TimeDelta{p}(p <= q ? x.v*10^(q-p) : div(x.v,10^(p-q))) promote_rule{p,q}(::Type{TimeDelta{p}}, ::Type{TimeDelta{q}}) = TimeDelta{min(p,q)} -{p}(x::TimeDelta{p}) = TimeDelta{p}(-x.v) +{p}(x::TimeDelta{p}, y::TimeDelta{p}) = TimeDelta{p}(x.v+y.v) -{p}(x::TimeDelta{p}, y::TimeDelta{p}) = TimeDelta{p}(x.v-y.v) +(x::TimeDelta, y::TimeDelta) = +(promote(x,y)...) -(x::TimeDelta, y::TimeDelta) = -(promote(x,y)...) end # module
ndgrid(v::AbstractVector) = copy(v) function ndgrid{T}(v1::AbstractVector{T}, v2::AbstractVector{T}) m, n = length(v1), length(v2) v1 = reshape(v1, m, 1) v2 = reshape(v2, 1, n) (repmat(v1, 1, n), repmat(v2, m, 1)) end function ndgrid_fill(a, v, s, snext) for j = 1:length(a) a[j] = v[div(rem(j-1, snext), s)+1] end end function ndgrid{T}(vs::AbstractVector{T}...) n = length(vs) sz = map(length, vs) out = ntuple(n, i->Array(T, sz)) s = 1 for i=1:n a = out[i]::Array v = vs[i] snext = s*size(a,i) ndgrid_fill(a, v, s, snext) s = snext end out end meshgrid(v::AbstractVector) = meshgrid(v, v) function meshgrid{T}(vx::AbstractVector{T}, vy::AbstractVector{T}) m, n = length(vy), length(vx) vx = reshape(vx, 1, n) vy = reshape(vy, m, 1) (repmat(vx, m, 1), repmat(vy, 1, n)) end function meshgrid{T}(vx::AbstractVector{T}, vy::AbstractVector{T}, vz::AbstractVector{T}) m, n, o = length(vy), length(vx), length(vz) vx = reshape(vx, 1, n, 1) vy = reshape(vy, m, 1, 1) vz = reshape(vz, 1, 1, o) om = ones(Int, m) on = ones(Int, n) oo = ones(Int, o) (vx[om, :, oo], vy[:, on, oo], vz[om, on, :]) end
using LRUExample TestLRU = LRUExample.UnboundedLRU{ASCIIString, ASCIIString}() TestBLRU = LRUExample.BoundedLRU{ASCIIString, ASCIIString}(1000) get_str(i) = ascii(vcat(map(x->[x>>4; x&0x0F], reinterpret(Uint8, [int32(i)]))...)) isbounded{L<:LRUExample.LRU}(::Type{L}) = any(map(n->n==:maxsize, L.names)) isbounded{L<:LRUExample.LRU}(l::L) = isbounded(L) nmax = int(logspace(2, 5, 4)) function lrutest() #println("LRU consistency tests") for lru in (TestLRU,TestBLRU) for n in nmax empty!(lru) #@printf(" %s, %d items\n", lru, n) #print(" Simple eviction: ") for i in 1:n str = get_str(i) lru[str] = str @assert lru.q[1].v == str if isbounded(lru) && length(lru) >= lru.maxsize tailstr = get_str(i-lru.maxsize+1) @assert lru.q[end].v == tailstr end end #println("pass") #print(" Lookup, random access: ") for i in 1:n str = get_str(rand(1:n)) if haskey(lru, str) # the bounded LRUs can have cache misses blah = lru[str] @assert lru.q[1].v == blah end end #println("pass") end empty!(lru) end end lrutest()
function add_method(gf, an, at, body) argexs = { Expr(symbol("::"), an[i], at[i]) for i=1:length(an) } def = quote let __F__=($gf) function __F__($(argexs...)) $body end end end eval(def) end macro staged(fdef) if !isa(fdef,Expr) || !is(fdef.head,:function) error("@staged: expected method definition") end fname = fdef.args[1].args[1] argspec = fdef.args[1].args[2:end] argnames = map(x->(isa(x,Expr) ? x.args[1] : x), argspec) qargnames = map(x->Expr(:quote,x), argnames) fbody = fdef.args[2] @gensym gengf argtypes expander genbody quote let ($gengf) global ($fname) # should be "outer" local ($expander) function ($expander)($(argnames...)) $fbody end ($gengf)() = 0 # should be initially empty GF function ($fname)($(argspec...)) ($argtypes) = typeof(tuple($(argnames...))) if !method_exists($gengf, $argtypes) ($genbody) = apply(($expander), ($argtypes)) add_method($gengf, {$(qargnames...)}, $argtypes, $genbody) end return ($gengf)($(argnames...)) end end end end # example @staged function nloops(dims::Tuple) names = map(x->gensym(), dims) ex = quote println([$(names...)]) end for i = 1:length(dims) ex = quote for $(names[i]) in dims[$i] $ex end end end ex end
function life_rule(old) m, n = size(old) new = similar(old, m-2, n-2) for j = 2:n-1 for i = 2:m-1 nc = +(old[i-1,j-1], old[i-1,j], old[i-1,j+1], old[i ,j-1], old[i ,j+1], old[i+1,j-1], old[i+1,j], old[i+1,j+1]) new[i-1,j-1] = (nc == 3 ? 1 : nc == 2 ? old[i,j] : 0) end end new end function life_step(d) DArray(size(d),procs(d)) do I # fetch neighborhood - toroidal boundaries top = mod(first(I[1])-2,size(d,1))+1 bot = mod( last(I[1]) ,size(d,1))+1 left = mod(first(I[2])-2,size(d,2))+1 right = mod( last(I[2]) ,size(d,2))+1 old = Array(Bool, length(I[1])+2, length(I[2])+2) @sync begin @async old[1 , 1 ] = d[top , left] # left side @async old[2:end-1, 1 ] = d[I[1], left] @async old[end , 1 ] = d[bot , left] @async old[1 , 2:end-1] = d[top , I[2]] @async old[2:end-1, 2:end-1] = d[I[1], I[2]] # middle @async old[end , 2:end-1] = d[bot , I[2]] @async old[1 , end ] = d[top , right] # right side @async old[2:end-1, end ] = d[I[1], right] @async old[end , end ] = d[bot , right] end life_rule(old) end end function plife(m, n) w = Window("parallel life", n, m) c = Canvas(w) pack(c) done = false c.mouse.button1press = (c,x,y)->(done=true) cr = getgc(c) grid = DArray(I->convert(Array{Bool,2},randbool(map(length,I))), (m, n), workers()) last = time(); f = 1 while !done @async begin img = convert(Array{Uint32,2},grid) .* 0x00ffffff set_source_surface(cr, CairoRGBSurface(img), 0, 0) paint(cr) reveal(c) end t = time() if t-last > 2 println("$(f/(t-last)) FPS") last = t; f = 0 end grid = life_step(grid) f += 1 sleep(0.01) end end
module Quaternions import Base: convert, promote_rule, show, real, imag, conj, abs, abs2, inv, +, -, /, * immutable Quaternion{T<:Real} <: Number q0::T q1::T q2::T q3::T end Quaternion(q0::Real,q1::Real,q2::Real,q3::Real) = Quaternion(promote(q0,q1,q2,q3)...) convert{T}(::Type{Quaternion{T}}, x::Real) = Quaternion(convert(T,x), zero(T), zero(T), zero(T)) convert{T}(::Type{Quaternion{T}}, z::Complex) = Quaternion(convert(T,real(z)), convert(T,imag(z)), zero(T), zero(T)) convert{T}(::Type{Quaternion{T}}, z::Quaternion) = Quaternion(convert(T,z.q0), convert(T,z.q1), convert(T,z.q2), convert(T,z.q3)) promote_rule{s,S}(::Type{MathConst{s}}, ::Type{Quaternion{S}}) = Quaternion{S} promote_rule{T,S}(::Type{Complex{T}}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)} promote_rule{S}(::Type{Bool}, ::Type{Quaternion{S}}) = Quaternion{S} promote_rule{T<:Real,S}(::Type{T}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)} promote_rule{T,S}(::Type{Quaternion{T}}, ::Type{Quaternion{S}}) = Quaternion{promote_type(T,S)} function show(io::IO, z::Quaternion) pm(x) = x < 0 ? " - $(-x)" : " + $x" print(io, z.q0, pm(z.q1), "i", pm(z.q2), "j", pm(z.q3), "k") end real(z::Quaternion) = z.q0 imag(z::Quaternion) = z.q1 conj(z::Quaternion) = Quaternion(z.q0, -z.q1, -z.q2, -z.q3) abs(z::Quaternion) = sqrt(z.q0*z.q0 + z.q1*z.q1 + z.q2*z.q2 + z.q3*z.q3) abs2(z::Quaternion) = z.q0*z.q0 + z.q1*z.q1 + z.q2*z.q2 + z.q3*z.q3 inv(z::Quaternion) = conj(z)/abs2(z) (-)(z::Quaternion) = Quaternion(-z.q0, -z.q1, -z.q2, -z.q3) (/)(z::Quaternion, x::Real) = Quaternion(z.q0/x, z.q1/x, z.q2/x, z.q3/x) (+)(z::Quaternion, w::Quaternion) = Quaternion(z.q0 + w.q0, z.q1 + w.q1, z.q2 + w.q2, z.q3 + w.q3) (-)(z::Quaternion, w::Quaternion) = Quaternion(z.q0 - w.q0, z.q1 - w.q1, z.q2 - w.q2, z.q3 - w.q3) (*)(z::Quaternion, w::Quaternion) = Quaternion(z.q0*w.q0 - z.q1*w.q1 - z.q2*w.q2 - z.q3*w.q3, z.q0*w.q1 + z.q1*w.q0 + z.q2*w.q3 - z.q3*w.q2, z.q0*w.q2 - z.q1*w.q3 + z.q2*w.q0 + z.q3*w.q1, z.q0*w.q3 + z.q1*w.q2 - z.q2*w.q1 + z.q3*w.q0) (/)(z::Quaternion, w::Quaternion) = z*inv(w) end # module
# wordcount.jl # # Implementation of parallelized "word-count" of a text, inspired by the # Hadoop WordCount example. Uses @spawn and fetch() to parallelize # the "map" task. Reduce is currently done single-threaded. # # To run in parallel on a string stored in variable `text`: # julia -p <N> # julia> require("<julia_dir>/examples/wordcount.jl") # julia> ...(define text)... # julia> counts=parallel_wordcount(text) # # Or to run on a group of files, writing results to an output file: # julia -p <N> # julia> require("<julia_dir>/examples/wordcount.jl") # julia> wordcount_files("/tmp/output.txt", "/tmp/input1.txt","/tmp/input2.txt",...) # "Map" function. # Takes a string. Returns a Dict with the number of times each word # appears in that string. function wordcount(text) words=split(text,[' ','\n','\t','-','.',',',':',';'],false) counts=Dict() for w = words counts[w]=get(counts,w,0)+1 end return counts end # "Reduce" function. # Takes a collection of Dicts in the format returned by wordcount() # Returns a Dict in which words that appear in multiple inputs # have their totals added together. function wcreduce(wcs) counts=Dict() for c in wcs, (k,v) in c counts[k] = get(counts,k,0)+v end return counts end # Splits input string into nprocs() equal-sized chunks (last one rounds up), # and @spawns wordcount() for each chunk to run in parallel. Then fetch()s # results and performs wcreduce(). function parallel_wordcount(text) lines=split(text,'\n',false) np=nprocs() unitsize=ceil(length(lines)/np) wcounts={} rrefs={} # spawn procs for i=1:np first=unitsize*(i-1)+1 last=unitsize*i if last>length(lines) last=length(lines) end subtext=join(lines[int(first):int(last)],"\n") push!(rrefs, @spawn wordcount( subtext ) ) end # fetch results while length(rrefs)>0 push!(wcounts,fetch(pop!(rrefs))) end # reduce count=wcreduce(wcounts) return count end # Takes the name of a result file, and a list of input file names. # Combines the contents of all files, then performs a parallel_wordcount # on the resulting string. Writes the results to result_file. function wordcount_files(result_file,inputs...) text = "" for file in inputs open(file) do f text *= readall(f) end end wc = parallel_wordcount(text) open(result_file,"w") do f for (k,v) in wc println(f, k,"=",v) end end end
module LRUExample # An LRU (Least Recently Used) cache is an associative data structure which # maintains its contents in an order such that the most recently used item # is at the beginning of the structure, and the least recently used at the end. # # This file specifies two types of LRU caches, both with and without a size # limit. BoundedLRU has a limit and evicts the LRU item if a new item is added # after that bound is reached. UnboundedLRU does not have a maximum size, but # can be used as a basis for more complex LRUs. # # LRUs should follow the interfaces for general collections, indexable # collections, and associative collections. # The standard implementation of an LRU backs a hash table with a doubly-linked # list for O(1) operations when reordering on access and eviction. The Julia # implementation instead backs the table with a Vector. For moderately-sized # collections, the difference in performance is small, and this implmentation # is simpler and easier to understand. import Base.isempty, Base.length, Base.sizeof import Base.start, Base.next, Base.done import Base.haskey, Base.get import Base.setindex!, Base.getindex, Base.delete!, Base.empty! import Base.show abstract LRU{K,V} <: Associative{K,V} # Default cache size const __MAXCACHE = 1024 type CacheItem{K,V} k::K v::V end type UnboundedLRU{K,V} <: LRU{K,V} ht::Dict q::Vector{CacheItem} UnboundedLRU() = new(Dict(), similar(Array(CacheItem,1), 0)) end UnboundedLRU() = UnboundedLRU{Any, Any}() type BoundedLRU{K,V} <: LRU{K,V} ht::Dict q::Vector{CacheItem} maxsize::Int BoundedLRU(m) = new(Dict(), similar(Array(CacheItem,1), 0), m) BoundedLRU() = BoundedLRU(__MAXCACHE) end BoundedLRU(m) = BoundedLRU{Any, Any}(m) BoundedLRU() = BoundedLRU{Any, Any}() ## collections ## isempty(lru::LRU) = isempty(lru.q) length(lru::LRU) = length(lru.q) haskey(lru::LRU, key) = haskey(lru.ht, key) ## associative ## # Should this check count as an access? haskey(lru::LRU, key) = haskey(lru.ht, key) get(lru::LRU, key, default) = haskey(lru, key) ? lru[key] : default function empty!(lru::LRU) empty!(lru.ht) empty!(lru.q) end show(io::IO, lru::UnboundedLRU) = print(io,"UnboundedLRU()") show(io::IO, lru::BoundedLRU) = print(io,"BoundedLRU($(lru.maxsize))") ## indexable ## # Method to do the second, slow lookup in the list with early return. function locate(q, x) for i = 1:length(q) if q[i] == x return i end end error("Item not found.") end function getindex(lru::LRU, key) item = lru.ht[key] idx = locate(lru.q, item) splice!(lru.q, idx) unshift!(lru.q, item) item.v end function setindex!(lru::LRU, v, key) if haskey(lru, key) item = lru.ht[key] idx = locate(lru.q, item) item.v = v splice!(lru.q, idx) else item = CacheItem(key, v) lru.ht[key] = item end unshift!(lru.q, item) end # Eviction function setindex!{V,K}(lru::BoundedLRU, v::V, key::K) invoke(setindex!, (LRU, V, K), lru, v, key) nrm = length(lru) - lru.maxsize for i in 1:nrm rm = pop!(lru.q) delete!(lru.ht, rm.k) end end ## associative ## function delete!(lru::LRU, key) item = lru.ht[key] idx = locate(lru.q, item) delete!(lru.ht, key) delete!(lru.q, idx) end end # module
module TypeTrees ## # Generate a text graphic of Julia modules type tree ## # The node type holds the type of the cuurent node and a dict of subtypes type TTNode strname::String typ::Type subtypes::Dict{String, TTNode} TTNode(sname::String, t::Type) = new(sname, t, Dict{String, TTNode}()) end # Add a node to a dict if not added function add_ttnode(subtypes::Dict{String, TTNode}, sname::String, tnode::TTNode) ret = get(subtypes, sname, nothing) (nothing == ret) && (ret = subtypes[sname] = tnode) ret end function add_ttnode(subtypes::Dict{String, TTNode}, sname::String, t::Type) ret = get(subtypes, sname, nothing) (nothing == ret) && (subtypes[sname] = ret = TTNode(sname, t)) ret end # Get a string name for the type typ_name(t::UnionType) = string(t) typ_name(t::TypeConstructor) = string(t) typ_name(t) = string(t.name) # Store a type and its type hierarchy chain # Recurse till we reach the top level type function store_type(sname::String, t::UnionType) suptype = UnionType tnode = TTNode(sname, t) # store unions under UnionType type subtypes = store_type(typ_name(suptype), suptype) add_ttnode(subtypes, sname, tnode) # unions are also in a sense related to the types of their components for suptype = t.types subtypes = store_type(typ_name(suptype), suptype) add_ttnode(subtypes, sname, tnode) end return tnode.subtypes end function store_type(sname::String, t::TypeConstructor) suptype = t.body subtypes = store_type(typ_name(suptype), suptype) tnode = add_ttnode(subtypes, sname, t) return tnode.subtypes end function store_type(sname::String, t::DataType) suptype = super(t) subtypes = (suptype != t) ? store_type(typ_name(suptype), suptype) : types_tree tnode = add_ttnode(subtypes, sname, t) return tnode.subtypes end function store_type(sname::String, t::Tuple) tnode = add_ttnode(types_tree, sname, t) return tnode.subtypes end function store_type(sname::String, t) suptype = super(t) subtypes = (suptype != t) ? store_type(typ_name(suptype), suptype) : types_tree tnode = add_ttnode(subtypes, sname, t) return tnode.subtypes end # examine all symbols in module and store those that are types function store_all_from(m::Module) for expr = names(m,true) try t = eval(m,expr) isa(t, Type) && store_type(string(expr), t) #catch ex # println("Error adding ", string(expr), m, " (", ex, ")") end end end type_props(typ) = "" type_props(typ::DataType) = string("<<", typ.abstract ? " abstract" : " concrete", typ.mutable ? " mutable" : " immutable", typ.pointerfree ? " pointerfree" : "", " size:", typ.size, " >>") function print_tree(subtypes::Dict{String, TTNode}, pfx::String="") for n in sort!([keys(subtypes)...]) v = subtypes[n] if(n == string(v.typ)) println(pfx, "+- ", n, " ", type_props(v.typ)) else println(pfx, "+- ", n, " = ", v.typ, " ", type_props(v.typ)) end print_tree(v.subtypes, pfx * ". ") end end # TODO: optionally take module names in command line # TODO: sort output # TODO: option to list subtrees of type tree, or other symbol types const types_tree = Dict{String, TTNode}() for m in (Base, Core, Main) store_all_from(m) end # print_tree(types_tree) end # module
## Based on "Multi-Threading and One-Sided Communication in Parallel LU Factorization" ## http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.138.4361&rank=7 function hpl_seq(A::Matrix, b::Vector) blocksize = 5 n = size(A,1) A = [A b] B_rows = linspace(0, n, div(n,blocksize)+1) B_rows[end] = n B_cols = [B_rows, [n+1]] nB = length(B_rows) depend = zeros(Bool, nB, nB) # In parallel, depend needs to be able to hold futures ## Small matrix case if nB <= 1 x = A[1:n, 1:n] \ A[:,n+1] return x end ## Add a ghost row of dependencies to boostrap the computation for j=1:nB; depend[1,j] = true; end for i=1:(nB-1) ## Threads for panel factorizations I = (B_rows[i]+1):B_rows[i+1] #(depend[i+1,i], panel_p) = spawn(panel_factor_seq, I, depend[i,i]) (depend[i+1,i], panel_p) = panel_factor_seq(A, I, depend[i,i]) ## Threads for trailing updates for j=(i+1):nB J = (B_cols[j]+1):B_cols[j+1] #depend[i+1,j] = spawn(trailing_update_seq, I, J, panel_p, depend[i+1,i],depend[i,j]) depend[i+1,j] = trailing_update_seq(A, I, J, panel_p, depend[i+1,i],depend[i,j]) end end ## Completion of the last diagonal block signals termination #wait(depend[nB, nB]) ## Solve the triangular system x = triu(A[1:n,1:n]) \ A[:,n+1] return x end ## hpl() ### Panel factorization ### function panel_factor_seq(A, I, col_dep) n = size (A, 1) ## Enforce dependencies #wait(col_dep) ## Factorize a panel K = I[1]:n panel_p = lufact!(sub(A, K, I))[:p] # Economy mode ## Panel permutation panel_p = K[panel_p] return (true, panel_p) end ## panel_factor_seq() ### Trailing update ### function trailing_update_seq(A, I, J, panel_p, row_dep, col_dep) n = size (A, 1) ## Enforce dependencies #wait(row_dep, col_dep) ## Apply permutation from pivoting K = (I[end]+1):n A[I[1]:n, J] = A[panel_p, J] ## Compute blocks of U L = tril(A[I,I],-1) + eye(length(I)) A[I, J] = L \ A[I, J] ## Trailing submatrix update if !isempty(K) A[K,J] = A[K,J] - A[K,I]*A[I,J] end return true end ## trailing_update_seq() # This version is written for a shared memory implementation. # The matrix A is local to the first Worker, which allocates work to other Workers # All updates to A are carried out by the first Worker. Thus A is not distributed hpl_par(A::Matrix, b::Vector) = hpl_par(A, b, max(1, div(max(size(A)),4)), true) hpl_par(A::Matrix, b::Vector, bsize::Integer) = hpl_par(A, b, bsize, true) function hpl_par(A::Matrix, b::Vector, blocksize::Integer, run_parallel::Bool) n = size(A,1) A = [A b] if blocksize < 1 throw(ArgumentError("hpl_par: invalid blocksize: $blocksize < 1")) end B_rows = linspace(0, n, div(n,blocksize)+1) B_rows[end] = n B_cols = [B_rows, [n+1]] nB = length(B_rows) depend = cell(nB, nB) ## Small matrix case if nB <= 1 x = A[1:n, 1:n] \ A[:,n+1] return x end ## Add a ghost row of dependencies to boostrap the computation for j=1:nB; depend[1,j] = true; end for i=2:nB, j=1:nB; depend[i,j] = false; end for i=1:(nB-1) #println("A=$A") ##### ## Threads for panel factorizations I = (B_rows[i]+1):B_rows[i+1] K = I[1]:n (A_KI, panel_p) = panel_factor_par(A[K,I], depend[i,i]) ## Write the factorized panel back to A A[K,I] = A_KI ## Panel permutation panel_p = K[panel_p] depend[i+1,i] = true ## Apply permutation from pivoting J = (B_cols[i+1]+1):B_cols[nB+1] A[K, J] = A[panel_p, J] ## Threads for trailing updates #L_II = tril(A[I,I], -1) + eye(length(I)) L_II = tril(sub(A,I,I), -1) + eye(length(I)) K = (I[length(I)]+1):n A_KI = A[K,I] for j=(i+1):nB J = (B_cols[j]+1):B_cols[j+1] ## Do the trailing update (Compute U, and DGEMM - all flops are here) if run_parallel A_IJ = A[I,J] #A_KI = A[K,I] A_KJ = A[K,J] depend[i+1,j] = @spawn trailing_update_par(L_II, A_IJ, A_KI, A_KJ, depend[i+1,i], depend[i,j]) else depend[i+1,j] = trailing_update_par(L_II, A[I,J], A[K,I], A[K,J], depend[i+1,i], depend[i,j]) end end # Wait for all trailing updates to complete, and write back to A for j=(i+1):nB J = (B_cols[j]+1):B_cols[j+1] if run_parallel (A_IJ, A_KJ) = fetch(depend[i+1,j]) else (A_IJ, A_KJ) = depend[i+1,j] end A[I,J] = A_IJ A[K,J] = A_KJ depend[i+1,j] = true end end ## Completion of the last diagonal block signals termination @assert depend[nB, nB] ## Solve the triangular system x = triu(A[1:n,1:n]) \ A[:,n+1] return x end ## hpl() ### Panel factorization ### function panel_factor_par(A_KI, col_dep) @assert col_dep ## Factorize a panel panel_p = lufact!(A_KI)[:p] # Economy mode return (A_KI, panel_p) end ## panel_factor_par() ### Trailing update ### function trailing_update_par(L_II, A_IJ, A_KI, A_KJ, row_dep, col_dep) @assert row_dep @assert col_dep ## Compute blocks of U A_IJ = L_II \ A_IJ ## Trailing submatrix update - All flops are here if !isempty(A_KJ) m, k = size(A_KI) n = size(A_IJ,2) blas_gemm('N','N',m,n,k,-1.0,A_KI,m,A_IJ,k,1.0,A_KJ,m) #A_KJ = A_KJ - A_KI*A_IJ end return (A_IJ, A_KJ) end ## trailing_update_par() ### using DArrays ### function hpl_par2(A::Matrix, b::Vector) n = size(A,1) A = [A b] C = distribute(A, 2) nB = length(C.pmap) ## case if only one processor if nB <= 1 x = A[1:n, 1:n] \ A[:,n+1] return x end depend = Array(RemoteRef, nB, nB) #pmap[i] is where block i's stuff is #block i is dist[i] to dist[i+1]-1 for i = 1:nB #println("C=$(convert(Array, C))") ##### ##panel factorization panel_p = remotecall_fetch(C.pmap[i], panel_factor_par2, C, i, n) ## Apply permutation from pivoting for j = (i+1):nB depend[i,j] = remotecall(C.pmap[j], permute, C, i, j, panel_p, n, false) end ## Special case for last column if i == nB depend[nB,nB] = remotecall(C.pmap[nB], permute, C, i, nB+1, panel_p, n, true) end ##Trailing updates (i == nB) ? (I = (C.dist[i]):n) : (I = (C.dist[i]):(C.dist[i+1]-1)) C_II = C[I,I] L_II = tril(C_II, -1) + eye(length(I)) K = (I[length(I)]+1):n if length(K) > 0 C_KI = C[K,I] else C_KI = zeros(0) end for j=(i+1):nB dep = depend[i,j] depend[j,i] = remotecall(C.pmap[j], trailing_update_par2, C, L_II, C_KI, i, j, n, false, dep) end ## Special case for last column if i == nB dep = depend[nB,nB] remotecall_fetch(C.pmap[nB], trailing_update_par2, C, L_II, C_KI, i, nB+1, n, true, dep) else #enforce dependencies for nonspecial case for j=(i+1):nB wait(depend[j,i]) end end end A = convert(Array, C) x = triu(A[1:n,1:n]) \ A[:,n+1] end ## hpl_par2() function panel_factor_par2(C, i, n) (C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) : (I = (C.dist[i]):(C.dist[i+1]-1)) K = I[1]:n C_KI = C[K,I] #(C_KI, panel_p) = lu!(C_KI) #economy mode panel_p = lu!(C_KI)[2] C[K,I] = C_KI return panel_p end ##panel_factor_par2() function permute(C, i, j, panel_p, n, flag) if flag K = (C.dist[i]):n J = (n+1):(n+1) C_KJ = C[K,J] C_KJ = C_KJ[panel_p,:] C[K,J] = C_KJ else K = (C.dist[i]):n J = (C.dist[j]):(C.dist[j+1]-1) C_KJ = C[K,J] C_KJ = C_KJ[panel_p,:] C[K,J] = C_KJ end end ##permute() function trailing_update_par2(C, L_II, C_KI, i, j, n, flag, dep) if isa(dep, RemoteRef); wait(dep); end if flag #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) : # (I = (C.dist[i]):(C.dist[i+1]-1)) I = C.dist[i]:n J = (n+1):(n+1) K = (I[length(I)]+1):n C_IJ = C[I,J] if length(K) > 0 C_KJ = C[K,J] else C_KJ = zeros(0) end ## Compute blocks of U C_IJ = L_II \ C_IJ C[I,J] = C_IJ else #(C.dist[i+1] == n+2) ? (I = (C.dist[i]):n) : # (I = (C.dist[i]):(C.dist[i+1]-1)) I = (C.dist[i]):(C.dist[i+1]-1) J = (C.dist[j]):(C.dist[j+1]-1) K = (I[length(I)]+1):n C_IJ = C[I,J] if length(K) > 0 C_KJ = C[K,J] else C_KJ = zeros(0) end ## Compute blocks of U C_IJ = L_II \ C_IJ C[I,J] = C_IJ ## Trailing submatrix update - All flops are here if !isempty(C_KJ) cm, ck = size(C_KI) cn = size(C_IJ,2) blas_gemm('N','N',cm,cn,ck,-1.0,C_KI,cm,C_IJ,ck,1.0,C_KJ,cm) #C_KJ = C_KJ - C_KI*C_IJ C[K,J] = C_KJ end end end ## trailing_update_par2() ## Test n*n matrix on np processors ## Prints 5 numbers that should be close to zero function test(n, np) A = rand(n,n); b = rand(n); X = (@elapsed x = A \ b); Y = (@elapsed y = hpl_par(A,b, max(1,div(n,np)))); Z = (@elapsed z = hpl_par2(A,b)); for i=1:(min(5,n)) print(z[i]-y[i], " ") end println() return (X,Y,Z) end ## test k times and collect average function test(n,np,k) sum1 = 0; sum2 = 0; sum3 = 0; for i = 1:k (X,Y,Z) = test(n,np) sum1 += X sum2 += Y sum3 += Z end return (sum1/k, sum2/k, sum3/k) end