# MarkovMatrices.awk # rgs, imim, april 98 # add pseudocounts, june 99 # usage=gawk -f MarkovMatrices order output_prefix cds_seqfile (.tbl) # # based on Borodosky and McIninch, Computers & Chemistry, 1993. BEGIN { PCOUNT=5; k=ARGV[1]; #order of the markov chain pmat=ARGV[2]; # output matrix file name ARGV[1]=ARGV[2]=""; # set pseudocounts alpha[1]="A"; alpha[2]="C"; alpha[3]="G"; alpha[4]="T"; # for all k-tuples for (i=1;i<=4;i++) nx(1,k,alpha[i],N0,4*PCOUNT); sizek=4^k; # for all k+1-tuples for (i=1;i<=4;i++) nx(1,k+1,alpha[i],N,PCOUNT); sizek1=4^(k+1); } { sequence=toupper($2); if (sequence !~ /[^ACGT]/) { # consider only standard acgt lseq=length(sequence); L+=(lseq-k); for (i=1;i<=lseq-k;i++) { ktuple=substr(sequence,i,k); ktuple1=substr(sequence,i,k+1); N0[ktuple]++; # intial probabilities N[ktuple1]++; # transition probabilities } } } END { # get number of k-tuples observed in each frame L0=L+sizek; for (t in N0) { # initial probabilities P0[t]=N0[t]/(L0); } for (t in N) { # transition probabilities tk=substr(t,1,k); # k-tuple P[t,tk]=N[t]/N0[tk]; } for (i in P0) print i,P0[i] > pmat "." k ".initial"; for (i in P) { split (i,x,SUBSEP); print x[1],x[2], P[i] > pmat "." k ".transition"; } } function nx(l,len,s,Mat,p, i) { if (l==len) { Mat[s]=p; size++; } else { l++; for (i=1;i<=4;i++) nx(l,len,s alpha[i], Mat,p); } }