#!/usr/bin/perl use strict; my $debug = 1; my $dl = "\t"; # output file delimiter my $data_dir = '.'; # the directory the program is running in my $file_suffix = 'data.txt'; # valid raw data file suffix my @file_prefixes = ('wt','npr','cpr'); # ,'scntl'); # valid file prefixes for average_each_treatment_group() my (%mica_db,%matrix,%group_matrix,%base_pairs); # main data structures my @all_files = load_data_from_files($data_dir,$file_suffix,@file_prefixes); # log_normalize_data(); # normalize height and area versus reference peaks within each file # values expressed as a ratio to an average reference peak of 1000 normalize_within_sample(); # normalize height and area between samples: assume equal amount of # total DNA (express each peak as a percent of the total sample) normalize_between_samples(); # check_valid_peak_percentage_sum(); # peaks should sum up to 100 percent # filter out samples that are 10x less than the maximum sample # filter_based_on_total_sample('area',10); filter_within_each_group_based_on_total_sample('height',90); # only take the top N samples in each group # filter_groups_based_on_total('area',3); #filter_peaks_min_percent('area',0.01); # average_each_treatment_group( @file_prefixes ); # %matrix = %group_matrix; # now use the group matrix for analysis # subtract_soil_control(); # filter_group_based_on_duplicate_peaks(2); create_cluster_input_file(); # create_excel_graph_file(); # list_top_peak_range(10); #limit_peaks_per_sample(10); #show_all_peaks(); exit; # STOP HERE sort_by_base_pairs(); # print_matrix(); show_matching_mica_bacteria_list(); exit; ###################### END OF MAIN PROGRAM ############################# sub create_excel_graph_file { print "\nCreating excel files...\n"; open(HEIGHT,">cluster6_h.csv") or die "unable to write height csv file\n"; open(AREA,">cluster6_a.csv") or die "unable to write area csv file\n"; $dl = ','; # comma-delimiting foreach my $file ( sort keys %matrix ) { # if ( $file =~ /\Ascntl/ ) { delete $matrix{$file}; next; } print HEIGHT $file . $dl; print AREA $file . $dl; } print HEIGHT "\n"; print AREA "\n"; for (my $i=1; $i <= 500; $i++) { foreach my $file ( sort keys %matrix ) { my $h = $matrix{$file}{$i}{height} || 0; my $a = $matrix{$file}{$i}{area} || 0; print HEIGHT $h . $dl; print AREA $a . $dl; } print HEIGHT "\n"; print AREA "\n"; } close(HEIGHT); close(AREA); } sub create_cluster_input_file { open(HEIGHT,">cluster6_h.out") or die "unable to write height.out\n"; open(AREA,">cluster6_a.out") or die "unable to write area.out\n"; print HEIGHT "Type" . $dl . "name" . $dl . "weight" . $dl . "order" . $dl; print AREA "Type" . $dl . "name" . $dl . "weight" . $dl . "order" . $dl; # 5 rows, each with 500 columns my $line; foreach my $file ( sort keys %matrix ) { my $name = $file; $name =~ s/_data.txt\Z//; $line .= $name . $dl; } $line =~ s/$dl\Z//; print HEIGHT "$line\n"; print AREA "$line\n"; my $count = 0; for (my $i=1; $i <= 500; $i++) { my @row_h = ($i,$i,1.0,$count); my @row_a = ($i,$i,1.0,$count++); foreach my $file ( sort keys %matrix ) { if ( ! exists( $matrix{$file}{$i} ) ){ push(@row_h,0); # height push(@row_a,0); # area } else { push(@row_h, $matrix{$file}{$i}{height} * 1000); push(@row_a, $matrix{$file}{$i}{area} * 1000); } } print HEIGHT join($dl,@row_h) . "\n"; print AREA join($dl,@row_a) . "\n"; } close(HEIGHT); close(AREA); } sub load_data_from_files { my ($dir,$suffix,@prefixes) = @_; opendir(DIR,"$dir") || die "Could not open directory $dir\n"; my @raw_list = grep(/$suffix/i, readdir(DIR)); closedir(DIR); foreach my $file ( @raw_list ) { foreach my $type ( @prefixes ) { if ( $file !~ /\A$type/ ) { next; } if ( $debug >= 1 ) { print "Loading data from file $file\n"; } load_matrix($file); } } print "\n"; return @raw_list; } sub load_matrix { my ($file) = @_; open(FILE,$file) or die "unable to open $file\n"; my $data = ; my @rows = split(/\cM/,$data); # fricken mac foreach my $row ( @rows ) { my @data = split(/$dl/,$row); if ( @data != 6 ) { die "$file row missing data: @data\n"; } my ($name,$min,$bp_pos,$height,$area,$ref) = @data; #$height = log( $height ); #$area = log( $area ); if ( $name =~ /\AB/ ) { # Assume BLUE = forward primer results # (Yellow = reverse primer) my $base_pair = roundUp($bp_pos); my $int = int($bp_pos); $matrix{$file}{$base_pair}{height} += $height; $matrix{$file}{total_height} += $height; $matrix{$file}{$base_pair}{area} += $area; $matrix{$file}{total_area} += $area; # always load height and area into two base pairs # assumes +/- 0.5 accuracy if ( $int == $base_pair && $int >= 1 ) { # below 0.5 my $early = $base_pair - 1; $matrix{$file}{$early}{height} += $height; $matrix{$file}{$early}{area} += $area; } else { # >= 0.5 my $late = $base_pair + 1; $matrix{$file}{$late}{height} += $height; $matrix{$file}{$late}{area} += $area; } } elsif ($name =~ /\AR/ ) { # Red = standards foreach my $standard ( (50,75,100,125,150,200,250,300,350,400,450) ) { if ( $bp_pos != $standard ) { next; } #if ( $height >= 50 && $matrix{$file}{std_count} < 10 ) { # min height for valid standard peak $matrix{$file}{std_height} += $height; $matrix{$file}{std_area} += $area; $matrix{$file}{std_count} += 1; #} } } } } sub normalize_within_sample { print "Normalizing within samples against reference standard peaks.\n\n"; foreach my $file ( reverse sort keys %matrix ) { my $count = $matrix{$file}{std_count} || 0; if ( $count != 10 ) { print "Skipping $file, found $count reference standards (requires 10).\n"; delete $matrix{$file}; next; # try next file } my $std_height_ave = $matrix{$file}{std_height} / 10; my $std_area_ave = $matrix{$file}{std_area} / 10; #delete $matrix{$file}{total_height}; #delete $matrix{$file}{total_area}; foreach my $bp ( sort{ $a <=> $b } keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } # Normalize height based on standards height my $height = $matrix{$file}{$bp}{height}; $matrix{$file}{$bp}{height} = ($height / $std_height_ave) * 1000; $matrix{$file}{total_height} += $matrix{$file}{$bp}{height}; my $area = $matrix{$file}{$bp}{area}; $matrix{$file}{$bp}{area} = ($area / $std_area_ave) * 1000; # $matrix{$file}{total_area} += $matrix{$file}{$bp}{area}; } } } sub normalize_between_samples { # express each peak as a percentage of the total sample sum # effectively normalizes peaks between different samples # (assume each sample was from an equal amount of DNA) print "Normalizing between samples...\n\n"; foreach my $file ( reverse sort keys %matrix ) { my ($height,$area); foreach my $bp ( sort{ $a <=> $b } keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } $area += $matrix{$file}{$bp}{area}; $height += $matrix{$file}{$bp}{height}; } $matrix{$file}{total_area} = $area; $matrix{$file}{total_height} = $height; } foreach my $file ( reverse sort keys %matrix ) { my $total_height = $matrix{$file}{total_height} or die "no total height for $file"; my $total_area = $matrix{$file}{total_area} or die "no total area for $file"; if ($debug == 1) { print "File $file total area = $total_area.\n"; } foreach my $bp ( sort{ $a <=> $b } keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } my $height = $matrix{$file}{$bp}{height} or next; my $area = $matrix{$file}{$bp}{area} or next; $height = ($height / $total_height) * 100; $area = ($area / $total_area) * 100; if ( $debug >= 2 ) { print "$file $bp height = $height, area = $area\n"; } $matrix{$file}{$bp}{height} = $height; $matrix{$file}{$bp}{area} = $area; } my $sum = 0; foreach my $bp ( sort{ $a <=> $b } keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } $sum += $matrix{$file}{$bp}{area}; } print "Sum area for $file is $sum\n"; } } sub filter_based_on_total_sample { my ($type,$max_scale_difference) = @_; my ($max_height,$max_area) = get_max_values(); print "Filtering out samples that are $max_scale_difference times smaller $type than the largest sample total.\n\n"; print "max height = $max_height, max area = $max_area\n"; foreach my $file ( reverse sort keys %matrix ) { my ($total,$max); if ( $type eq 'area' ) { $total = $matrix{$file}{total_area}; $max = $max_area; } else { $total = $matrix{$file}{total_height}; $max = $max_height; } $total or die "no total $type for $file"; my $scale = $max / $total; if ( $debug >= 1 ) { print "Scale $type = $scale for $file\n"; } if ( $scale > $max_scale_difference ) { print "$file Total sample $type too small, removing from the analysis.\n"; delete $matrix{$file}; } } } sub filter_within_each_group_based_on_total_sample { my ($type,$max_scale_difference) = @_; print "Filtering out samples that are $max_scale_difference times smaller $type than the largest sample total.\n\n"; foreach my $prefix ( @file_prefixes ) { my ($max_height,$max_area) = get_max_values($prefix); print "for $prefix files, max height = $max_height, max area = $max_area\n"; foreach my $file ( reverse sort keys %matrix ) { if ( $file !~ /\A$prefix/ ) { next; } my ($total,$max); if ( $type eq 'area' ) { $total = $matrix{$file}{total_area}; $max = $max_area; } else { $total = $matrix{$file}{total_height}; $max = $max_height; } $total or die "no total $type for $file"; my $scale = $max / $total; if ( $debug >= 1 ) { print "Scale $type = $scale for $file\n"; } if ( $scale > $max_scale_difference ) { print "$file Total sample $type too small, removing from the analysis.\n"; delete $matrix{$file}; } } } foreach my $prefix ( @file_prefixes ) { my @list; foreach my $file ( reverse sort keys %matrix ) { if ( $file !~ /\A$prefix/ ) { next; } push(@list,$file); } my $count = @list; print "$prefix includes $count files @list\n"; } } sub filter_groups_based_on_total { my ($type,$samples_needed) = @_; foreach my $prefix ( @file_prefixes ) { my %best; foreach my $file ( reverse sort keys %matrix ) { if ( $file !~ /\A$prefix/ ) { next; } if ( $type eq 'area' ) { $best{ $matrix{$file}{total_area} } = $file; } else { $best{ $matrix{$file}{total_height} } = $file; } } my @ordered = sort{ $a <=> $b } keys %best; my @small = splice(@ordered,$samples_needed,$#ordered); foreach my $sample ( @small ) { if ( -e $best{$sample} ) { print "Removing $best{$sample} from $prefix analysis (too small).\n"; delete $matrix{ $best{$sample} }; } } } print "\nTop $samples_needed files in each group based on $type:\n"; foreach my $prefix ( @file_prefixes ) { my @okay; foreach my $file ( reverse sort keys %matrix ) { if ( $file !~ /\A$prefix/ ) { next; } push(@okay,$file); } print "$prefix files: @okay.\n"; } } sub get_max_values { my $prefix = shift() || ''; my $max_area = 0; my $max_height = 0; foreach my $file ( keys %matrix ) { if ( $prefix && $file !~ /\A$prefix/ ) { next; } my $total_area = $matrix{$file}{total_area} || 0; my $total_height = $matrix{$file}{total_height} || 0; if ( $total_area > $max_area ) { $max_area = $total_area; } if ( $total_height > $max_height ) { $max_height = $total_height; } } return ($max_height,$max_area); } sub log_normalize_data { foreach my $file ( reverse sort keys %matrix ) { foreach my $bp ( sort{ $a <=> $b } keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } $matrix{$file}{$bp}{height} = log( $matrix{$file}{$bp}{height} ); $matrix{$file}{$bp}{area} = log( $matrix{$file}{$bp}{area} ); } } } sub filter_group_based_on_duplicate_peaks { my $matches = shift; print "\nFiltering group matrix; only include peaks with $matches matches.\n\n"; foreach my $file ( keys %matrix ) { foreach my $prefix ( @file_prefixes ) { if ( $file !~ /\A$prefix/i ) { next; } foreach my $bp ( keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } if ( exists $matrix{$file}{$bp} ) { $group_matrix{$prefix}{$bp}{hits}++; } } } } foreach my $prefix ( keys %group_matrix ) { foreach my $bp ( sort{ $a <=> $b } keys %{$group_matrix{$prefix}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } if ( $group_matrix{$prefix}{$bp}{hits} < $matches ) { delete $group_matrix{$prefix}{$bp}; } else { if ( $debug >= 2 ) { print "$prefix files had $group_matrix{$prefix}{$bp}{hits} hits at $bp\n"; } } } } } sub average_each_treatment_group { my @prefixes = @_; print "\nAveraging each treatment group for groups: @prefixes\n"; foreach my $file ( keys %matrix ) { foreach my $prefix ( @prefixes ) { if ( $file !~ /\A$prefix/i ) { next; } sum_group($prefix,$file); $group_matrix{$prefix}{members}++; } } foreach my $prefix ( keys %group_matrix ) { my $divisor = $group_matrix{$prefix}{members} or die "$prefix has no members\n"; foreach my $bp ( keys %{$group_matrix{$prefix}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } if ( $group_matrix{$prefix}{$bp}{area} ) { $group_matrix{$prefix}{$bp}{height} /= $divisor; $group_matrix{$prefix}{$bp}{area} /= $divisor; } } } } sub sum_group { my ($prefix,$file) = @_; foreach my $bp ( keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } $group_matrix{$prefix}{$bp}{height} += $matrix{$file}{$bp}{height}; $group_matrix{$prefix}{total_height} += $matrix{$file}{$bp}{height}; $group_matrix{$prefix}{$bp}{area} += $matrix{$file}{$bp}{area}; push(@{$group_matrix{$prefix}{$bp}{values}},$matrix{$file}{$bp}{area}); $group_matrix{$prefix}{total_area} += $matrix{$file}{$bp}{area}; } } sub subtract_soil_control { foreach my $prefix ( keys %matrix ) { if ( $prefix =~ /\Ascntl/ ) { delete $matrix{$prefix}; next; } foreach my $bp ( keys %{$matrix{$prefix}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } if ( $matrix{$prefix}{$bp}{area} ) { $matrix{$prefix}{$bp}{height} -= $group_matrix{'scntl'}{$bp}{height}; $matrix{$prefix}{$bp}{area} -= $group_matrix{'scntl'}{$bp}{area}; if ( $matrix{$prefix}{$bp}{height} < 0 ) { delete $matrix{$prefix}{$bp}{height}; } if ( $matrix{$prefix}{$bp}{area} < 0 ) { delete $matrix{$prefix}{$bp}{area}; } } } } } sub limit_peaks_per_sample { my $target_peaks = shift; print "\nLimiting peaks per sample to the largest $target_peaks based on area.\n\n"; my $type = 'area'; foreach my $file ( keys %matrix ) { my %ordered; foreach my $bp ( keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } my $value = $matrix{$file}{$bp}{$type}; $ordered{$value} = $bp; } my @ordered = reverse sort{ $a <=> $b } keys %ordered; my $count = @ordered; print "$file had $count peaks, limiting to top $target_peaks...\n"; my @result = splice(@ordered,0,$target_peaks); # print "result = @result\n"; my %peaks; foreach my $value ( @result ) { print "value = $value, bp = $ordered{$value}\n"; $peaks{ $ordered{$value} } = $value; } foreach my $bp ( keys %{$matrix{$file}} ) { if ( ! exists $peaks{$bp} ) { delete $matrix{$file}{$bp}; } # else { print "saving $bp\n"; } } } } sub filter_peaks_min_percent { my ($type,$percent) = @_; print "\nFiltering peaks based on $type to $percent percent of total sample $type\n\n"; foreach my $file ( sort keys %matrix ) { my ($peaks,%peaks) = filter_peaks($file,$type,$percent); my @peaks = sort { $a <=> $b } keys %peaks; print "$file had $peaks peaks: @peaks.\n"; } } sub filter_peaks { my ($file,$type,$percent) = @_; # $percent = ($percent / 100); my $total; if ( $type eq 'area' ) { $total = $matrix{$file}{total_area}; } else { $total = $matrix{$file}{total_height}; } $total or next; foreach my $bp ( keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } if ( $matrix{$file}{$bp}{$type} < $percent ) { delete $matrix{$file}{$bp}; } } my $peaks = 0; my %peaks; foreach my $bp ( keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } if ( $matrix{$file}{$bp}{$type} ) { $peaks++; $peaks{$bp} = 1; } } return ($peaks,%peaks); } sub print_matrix { use Data::Dumper; # simple procedural interface print Dumper(%matrix); } sub show_all_peaks { print "\n"; foreach my $file ( sort keys %matrix ) { my %peaks; foreach my $bp ( keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } $peaks{$bp} = $matrix{$file}{$bp}{area}; } my @peaks = sort { $a <=> $b } keys %peaks; my $peaks = @peaks; print "$file has $peaks peaks: @peaks.\n"; } } sub check_valid_peak_percentage_sum { print "Checking valid peak percentage sum...\n"; foreach my $file ( sort keys %matrix ) { my $area = 0; foreach my $bp ( sort { $a <=> $b } keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } $area += $matrix{$file}{$bp}{area}; # print "$bp = " . $matrix{$file}{$bp}{area} . "\n"; } print "The area sum for $file is $area.\n"; } } sub list_top_peak_range { my $target_peaks = shift; print "Listing values for top $target_peaks peaks:\n"; my %ordered; foreach my $file ( sort keys %matrix ) { foreach my $bp ( keys %{$matrix{$file}} ) { if ( $bp !~ /\A\d+\Z/ ) { next; } my $value = $matrix{$file}{$bp}{area}; $base_pairs{$bp}{$file} = $value; $ordered{$value} = $bp; } } # get the largest peaks in all samples my @ordered = reverse sort{ $a <=> $b } keys %ordered; my $count = @ordered; my @big_values = splice(@ordered,0,$target_peaks); print "Largest values = @big_values\n\n"; my @big_peaks; foreach my $value ( @big_values ) { push(@big_peaks,$ordered{$value}); } foreach my $bp ( sort { $a <=> $b } @big_peaks ) { foreach my $group ( sort keys %{$base_pairs{$bp}} ) { print "\t$bp $group = $base_pairs{$bp}{$group}"; my @values = @{$matrix{$group}{$bp}{values}}; print "[ @values ]\n"; } print "\n"; } } sub show_matching_mica_bacteria_list { load_mica_db('mica_db.txt'); my $count = 0; my %list; foreach my $file ( sort keys %matrix ) { my @bacteria_list; for (my $i=10; $i <= 500; $i++) { if ( exists( $matrix{$file}{$i}{area} ) ){ if ( exists $mica_db{$i} || exists $mica_db{$i-1} || exists $mica_db{$i+1} ) { foreach my $entry ( keys %{$mica_db{$i}} ) { push(@bacteria_list,"$i : $entry = $mica_db{$i}{$entry}"); } } } } $list{$file} = \@bacteria_list; } foreach my $file ( sort keys %matrix ) { my @bacteria = @{$list{$file}}; print "\n$file:\n" . join("\n\t",@bacteria); } } sub load_mica_db { my ($file) = @_; open(FILE,$file) or die "unable to open $file\n"; my $count = 0; foreach my $line () { if ( $line =~ /\A\d+/ ) { chomp($line); $line =~ s/\"//g; my ($for,$rev,$Accession,$ref,$desc) = split(/,/,$line); if ( ($for + $rev > 515) && ($for + $rev < 530) ) { $mica_db{$for}{$ref} = "$desc ($Accession)"; $count++; } } } if ( $debug >= 2 ) { foreach my $bp ( sort { $a <=> $b } keys %mica_db ) { foreach my $entry ( sort { $a <=> $b } keys %{$mica_db{$bp}} ) { print "$entry = $mica_db{$bp}{$entry}\n"; } } } print "\nTotal of $count valid entries in Mica database.\n\n"; } sub roundUp { my $value = shift; return int( $value + 0.5 ); }