Code to compute the correlation dimension as a function of "r". Note that the intrinsic dimensionality of a data set is, at most, the steepest slope in a plot of log(C(r)) vs log(r), where C(r) is the fraction of row pairs whose distance is at most r.
@include "lib.awk"
@include "dist.awk"
@include "readcsv.awk"
# _cr: demo entry point for cr().
# Hands the option/default string to args() (defined outside this file view;
# presumably it fills `opts` from the command line, falling back on the listed
# defaults -- confirm against lib.awk) and, when args() reports success, runs
# cr() on the chosen data file, step count and jump size.
# `opts` is a local array (extra formal parameter, awk convention).
function _cr(    opts) {
  if (!args("-d,data/nasa93.csv,-steps,20,-jumps,4", opts))
    return
  cr(opts["-d"], opts["-steps"], opts["-jumps"])
}
# cr: print the log-log correlation curve of the data in file `f`, then
# report the steepest slope found on that curve.
#
#   f      path of the CSV file to load (read through readcsv("cat " f, ...))
#   steps  number of radii to probe; radii are 1/steps, 2/steps, ..., 1
#   jumps  window width (in curve points) handed on to steepest()
#
# For every radius r the fraction c of row pairs within distance r is
# computed; radii with c == 0 are skipped, and the scan stops at the first
# radius where c == 1. Each kept point is printed as "log(r) <TAB> log(c)".
# Finally "# <steepest slope> <file>" is emitted through say().
#
# Everything after `jumps` in the parameter list is a local variable.
function cr(f,steps,jumps,
            _Rows,r,k,c,x,logc,logr,i,n) {
  readcsv("cat "f,_Rows)
  distances(_Rows,x)
  # NOTE(review): `d` is neither a parameter nor a local of this function;
  # presumably it is an array of rows filled in by readcsv (or supplied by a
  # preprocessor expansion of `_Rows`) -- confirm. Its length is used as the
  # row count and does not change inside the loop, so it is read once here.
  n = length(d)
  # Derive r from an integer counter rather than accumulating r += 1/steps:
  # repeated addition drifts, which can push the last radius slightly above 1
  # and silently drop the r == 1 sample.
  for(i=1; i<=steps; i++) {
    r = i/steps
    c = correlationDimension(r,x,n)
    if (c==0) continue   # log(0) is undefined; nothing to plot yet
    if (c==1) break      # every pair is already inside r; the curve is flat from here
    k++
    print (logr[k] = log(r)) "\t" (logc[k] = log(c))
  }
  say("# " steepest(k,jumps,logr,logc) " " f "\n")
}
# distances: fill x[a][b] with dist(all[a], all[b], _Rows, 1) for every pair
# of keys (a, b) of `all` with b > a, so each unordered pair is stored once.
# NOTE(review): `all` is not a parameter or local of this function; presumably
# it is a global (or preprocessor-supplied) array of rows -- confirm.
# The semantics of dist() and of its final argument 1 are defined elsewhere.
# `a` and `b` are locals.
function distances(_Rows,x,    a,b) {
  for (a in all) {
    for (b in all) {
      if (b <= a)
        continue          # keep only one orientation of each pair, skip self-pairs
      x[a][b] = dist(all[a], all[b], _Rows, 1)
    }
  }
}
# correlationDimension: fraction of stored pairs whose distance is <= r.
#
#   r  radius to test against
#   x  two-level array x[i][j] of pairwise distances (one entry per pair)
#   n  number of items the pairs were drawn from; the count is normalised
#      by the n*(n-1)/2 possible unordered pairs
#
# Returns a number in [0,1] when x holds at most n*(n-1)/2 entries.
# Returns 0 when n < 2, since no pair exists (and the normaliser would be
# zero, which is a fatal division error in awk).
# `i`, `j` and `c` are locals.
function correlationDimension(r,x,n,   i,j,c) {
  if (n < 2)
    return 0
  for(i in x)
    for(j in x[i])
      c += x[i][j] <= r    # a comparison yields 1 or 0, so this counts hits
  # Multiply before dividing: when every pair is within r, 2*c equals
  # n*(n-1) exactly and the result is exactly 1. Scaling c by a pre-rounded
  # reciprocal can land just below 1 and defeat an equality test on the result.
  return (2*c) / (n*(n-1))
}
# steepest: largest slope of the (logr, logc) curve measured over
# consecutive, non-overlapping windows that are `jumps` points wide.
#
#   max    index of the last usable point (points are indexed from 1)
#   jumps  window width; window k spans points lo and lo + jumps
#   logr   x-coordinates of the curve
#   logc   y-coordinates of the curve
#
# Returns the largest positive slope seen. `best` starts uninitialised, so
# when no window fits (max <= jumps) or no slope exceeds zero the return
# value is awk's uninitialised value (empty string / 0).
# `lo`, `hi`, `slope` and `best` are locals.
function steepest(max,jumps,logr,logc,
                  lo,hi,slope,best) {
  lo = 1
  while (lo <= max - jumps) {
    hi = lo + jumps
    slope = (logc[hi] - logc[lo]) / (logr[hi] - logr[lo])
    if (slope > best)
      best = slope
    lo = hi               # next window starts where this one ended
  }
  return best
}
And the results are, sorted lowest to highest...
- low
- 0.91 data/china.csv
- 1.97 data/kemerer.csv
- 2.77 data/finnish.csv
- 2.92 data/miyazaki94.csv
- 3.00 data/albrecht.csv
- 3.35 data/nasa93c1.csv
- medium
- 3.70 data/coc81o.csv
- 3.96 data/telecom.csv
- 4.00 data/coc81sd.csv
- 4.07 data/coc81.csv
- 4.10 data/desharnais.csv
- high
- 4.51 data/nasa93c2.csv
- 4.54 data/nasa93c5.csv
- 4.78 data/coc81e.csv
- 5.74 data/nasa93.csv
- 8.19 data/sdr.csv
No comments:
Post a Comment