Count number of different occurrences in a string by UNIX along one column into a file
Your input doesn't match your output, so we're all just guessing, but this might be what you want:
$ cat tst.awk
BEGIN { FS=OFS="\t" }
{
    delete cnt
    split($2,tmp,/ /)
    for (i in tmp) {
        str = tmp[i]
        cnt[str]++
    }
    printf "%s", $0
    sep = OFS
    for (str in cnt) {
        printf "%s%s=%d", sep, str, cnt[str]
        sep = ";"
    }
    print ""
}
Depending on what your input really is, the above will output the following:
$ cat file
rs12255619 A/C chr10 AA AA AC AA AA AA AA AA AA AC AA
rs7909677 A/G chr10 AA AA AA AA AA AA AA AA AA AA AA
$ awk -f tst.awk file
rs12255619 A/C chr10 AA AA AC AA AA AA AA AA AA AC AA AA=9;AC=2
rs7909677 A/G chr10 AA AA AA AA AA AA AA AA AA AA AA AA=11
$ cat file
rs12255619 A/C chr10 AA AA AC AA AA AA AA AA AA AC AA
rs7909677 A/G chr10 AA AA AA AA AA AA AA AA AA AA CC
$ awk -f tst.awk file
rs12255619 A/C chr10 AA AA AC AA AA AA AA AA AA AC AA AA=9;AC=2
rs7909677 A/G chr10 AA AA AA AA AA AA AA AA AA AA CC AA=10;CC=1
Something like this?
$ awk '{for(i=4;i<=NF;i++) c[$i]++; for(k in c) {s=s sep k"="c[k]; sep=";"; c[k]=0} $NF=$NF OFS s; s=sep=""}1' file | column -t
rs12255619 A/C chr10 AA AA AC AA AA AA AA AA AA AC AA AA=9;AC=2
rs7909677 A/G chr10 AA AA AA AA AA AA AA AA AA AA AA AA=11;AC=0
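Note that the set of reported keys grows as the file is read: the counts are reset to zero after every row, but the keys themselves are kept, so each line lists only the keys observed up to and including that row. For example, if CC first appeared
in the second row, its count would not be listed on the first line.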
You could also do it in Perl:
perl -lpe '$a{$_}++ for /\b[A-Z]{2}\b/g; $_.=" ".join(";",map{"$_=$a{$_}"}keys%a); %a = map{$_=>0}keys%a' file
produces
rs12255619 A/C chr10 AA AA AC AA AA AA AA AA AA AC AA AA=9;AC=2
rs7909677 A/G chr10 AA AA AA AA AA AA AA AA AA AA CC AA=10;CC=1;AC=0
For the new requirement, clear the hash after every line so that only the keys present on that line are reported:
perl -lpe '$a{$_}++ for /\b[A-Z]{2}\b/g; $_.=" ".join(";",map{"$_=$a{$_}"}keys%a); undef %a' file
produces
rs12255619 A/C chr10 AA AA AC AA AA AA AA AA AA AC AA AC=2;AA=9
rs7909677 A/G chr10 AA AA AA AA AA AA AA AA AA AA CC CC=1;AA=10