#!/usr/bin/perl # # Copyright: Alvaro Gonzalez & Li Liao 2007 # Computer and Information Sciences, University of Delaware # # Usage: # perl /home/alvaro/Assembly_Project/MUMmer/Test_Alignments/nucmer_weighted_linear_regression.pl # -i input_file (e.g. /home/alvaro/Assembly_Project/MUMmer/Test_Alignments/nucmer_BAC03-ABI2X454-AllContigs_to_Jap.cluster) # -o alignments' output file (e.g. /home/alvaro/Assembly_Project/MUMmer/Test_Alignments/nucmer_BAC03-ABI2X454-AllContigs_to_Jap_algmt) # -e percentage of allowed error (e.g. 3, meaning 3%) # -t temporal_file (e.g. /home/alvaro/Assembly_Project/MUMmer/Test_Alignments/temp_file) # -s scores_file (e.g. /home/alvaro/Assembly_Project/MUMmer/Test_Alignments/scores) # -d database_name # -u database_user_name # -p database_password # The input file is the output from nucmer (the .cluster file). Nucmer should # be run such that no extensions are allowed (use -nodelta). This script first # parses the .cluster file from Nucmer, which contains exact matches of each # contig in the query file as it is aligned to a reference genome. When the # coordinates of all the seeds have been parsed, they're read into a mysql table # in function cluster_process, where they are sorted and processed according # to the weighted linear regression algorithm. The output of this function is # a pair of coordinates, start and end, which represent the region where the # contig is most likely to align to the reference genome. A score is also given # to each alignment; this score also appears in the output. use DBI; # The next module and the function "getopt" help to parse input arguments. use Getopt::Std; getopt("ioetsdup", \%args); $opt_i = $args{i}; $opt_o = $args{o}; $opt_e = $args{e}; $opt_t = $args{t}; $opt_s = $args{s}; $opt_d = $args{d}; $opt_u = $args{u}; $opt_p = $args{p}; open (CLUSTERS, $opt_i); open (OUTPUT, ">".$opt_o); open (OUTPUT_D, ">".$opt_o."_det"); #open (OUTPUT_M, ">".$opt_o."_matlab"); open (SCORES, ">".$opt_s); $state = 0; @clusters_file = ; chomp(@clusters_file); foreach $clusters_line (@clusters_file) { if ($state == 0) # Initial State { if ($clusters_line =~ /^>/) { open (TEMP, ">".$opt_t); @dum = split(/ /, $clusters_line); $subject_id = substr($dum[0], 1); $query_id = $dum[1]; #$cluster_number = substr($query_id, 66, (length($query_id)-66-1)); $subject_length = $dum[2]; $query_length = $dum[3]; $state = 1; } } elsif ($state == 1) # Read alignment header { if ($clusters_line =~ /1 1/) { $state = 2; } elsif ($clusters_line =~ /1 -1/) { $state = 3; } } elsif ($state == 2) # Read cluster header, forward direction { @dum = split(/ +/, $clusters_line); if ($#dum == 5) #The first element of @dum is empty { $x1 = $dum[1]; $x2 = $dum[1] + $dum[3] - 1; $y1 = $dum[2]; $y2 = $dum[2] + $dum[3] - 1; } else { $x1 = $dum[0]; $x2 = $dum[0] + $dum[2] - 1; $y1 = $dum[1]; $y2 = $dum[1] + $dum[2] - 1; } $slope = ($y2 - $y1)/($x2 - $x1); $y_intersect = $y1 - $slope*$x1; $weight = $x2 - $x1; print (TEMP $x1."\t".$x2."\t".$y1."\t".$y2."\t".$slope."\t".$y_intersect."\t".$weight."\t"."0"."\n"); $state = 4; } elsif ($state == 3) # Read cluster header, backward direction { @dum = split(/ +/, $clusters_line); if ($#dum == 5) #The first element of @dum is empty { $x1 = $dum[1]; $x2 = $dum[1] + $dum[3] - 1; $y1 = $dum[2]; $y2 = $dum[2] - $dum[3] + 1; } else { $x1 = $dum[0]; $x2 = $dum[0] + $dum[2] - 1; $y1 = $dum[1]; $y2 = $dum[1] - $dum[2] + 1; } $slope = ($y2 - $y1)/($x2 - $x1); $y_intersect = $y1 - $slope*$x1; $weight = $x2 - $x1; print (TEMP $x1."\t".$x2."\t".$y1."\t".$y2."\t".$slope."\t".$y_intersect."\t".$weight."\t"."0"."\n"); $state = 5; } elsif ($state == 4) # Read line, forward direction { if ($clusters_line =~ /^>/) { $start_ref = 0; $end_ref = 0; $start_que = 0; $end_que = 0; $matched = 0; $matched_in_str = 0; $matched_in_clu = 0; $strand = 0; $clu_y_interc = 0; $y_interc_mean = 0; $y_interc_std = 0; $error = $query_length*($opt_e/100); close (TEMP); cluster_process(\$start_ref, \$end_ref, \$start_que, \$end_que, \$matched, \$matched_in_str, \$matched_in_clu, \$strand, \$clu_y_interc, \$y_interc_mean, \$y_interc_std, $error, $opt_t, $opt_d, $opt_u, $opt_p); system("rm ".$opt_t); $perc_matched = ($matched/$query_length)*100; $perc_matched_in_str = ($matched_in_str/$query_length)*100; $perc_matched_in_clu = ($matched_in_clu/$query_length)*100; $lgth_ref = $end_ref - $start_ref; $lgth_que = $end_que - $start_que; if ($lgth_ref < $lgth_que) { $score = (100/4)*($matched/$query_length + $matched_in_str/$matched + $matched_in_clu/$matched_in_str + $lgth_ref/$lgth_que); } else { $score = (100/4)*($matched/$query_length + $matched_in_str/$matched + $matched_in_clu/$matched_in_str + $lgth_que/$lgth_ref); } print (OUTPUT $subject_id."\t".$query_id."\t".$start_ref."\t".$end_ref."\t".$score."\n"); print (OUTPUT_D $subject_id."\t".$query_id."\t".$start_ref."\t".$end_ref."\t".$score."\t".$strand."\t".$query_length."\t".$lgth_que."\t".$lgth_ref."\t".$perc_matched."\t".$perc_matched_in_str."\t".$perc_matched_in_clu."\t".$clu_y_interc."\t".$y_interc_mean."\t".$y_interc_std."\n"); #print (OUTPUT_M $start_ref."\t".$end_ref."\t".$score."\t".$cluster_number."\n"); print (SCORES $score."\n"); open (TEMP, ">".$opt_t); @dum = split(/ /, $clusters_line); $subject_id = substr($dum[0], 1); $query_id = $dum[1]; #$cluster_number = substr($query_id, 66, (length($query_id)-66-1)); $subject_length = $dum[2]; $query_length = $dum[3]; $state = 1; } elsif ($clusters_line =~ /1 1/) { $state = 2; } elsif ($clusters_line =~ /1 -1/) { $state = 3; } else { @dum = split(/ +/, $clusters_line); if ($#dum == 5) #The first element of @dum is empty { $x1 = $dum[1]; $x2 = $dum[1] + $dum[3] - 1; $y1 = $dum[2]; $y2 = $dum[2] + $dum[3] - 1; } else { $x1 = $dum[0]; $x2 = $dum[0] + $dum[2] - 1; $y1 = $dum[1]; $y2 = $dum[1] + $dum[2] - 1; } $slope = ($y2 - $y1)/($x2 - $x1); $y_intersect = $y1 - $slope*$x1; $weight = $x2 - $x1; print (TEMP $x1."\t".$x2."\t".$y1."\t".$y2."\t".$slope."\t".$y_intersect."\t".$weight."\t"."0"."\n"); $state = 4; } } elsif ($state == 5) # Read line, backward direction { if ($clusters_line =~ /^>/) { $start_ref = 0; $end_ref = 0; $start_que = 0; $end_que = 0; $matched = 0; $matched_in_str = 0; $matched_in_clu = 0; $strand = 0; $clu_y_interc = 0; $y_interc_mean = 0; $y_interc_std = 0; $error = $query_length*($opt_e/100); close (TEMP); cluster_process(\$start_ref, \$end_ref, \$start_que, \$end_que, \$matched, \$matched_in_str, \$matched_in_clu, \$strand, \$clu_y_interc, \$y_interc_mean, \$y_interc_std, $error, $opt_t, $opt_d, $opt_u, $opt_p); system("rm ".$opt_t); $perc_matched = ($matched/$query_length)*100; $perc_matched_in_str = ($matched_in_str/$query_length)*100; $perc_matched_in_clu = ($matched_in_clu/$query_length)*100; $lgth_ref = $end_ref - $start_ref; $lgth_que = $end_que - $start_que; if ($lgth_ref < $lgth_que) { $score = (100/4)*($matched/$query_length + $matched_in_str/$matched + $matched_in_clu/$matched_in_str + $lgth_ref/$lgth_que); } else { $score = (100/4)*($matched/$query_length + $matched_in_str/$matched + $matched_in_clu/$matched_in_str + $lgth_que/$lgth_ref); } print (OUTPUT $subject_id."\t".$query_id."\t".$start_ref."\t".$end_ref."\t".$score."\n"); print (OUTPUT_D $subject_id."\t".$query_id."\t".$start_ref."\t".$end_ref."\t".$score."\t".$strand."\t".$query_length."\t".$lgth_que."\t".$lgth_ref."\t".$perc_matched."\t".$perc_matched_in_str."\t".$perc_matched_in_clu."\t".$clu_y_interc."\t".$y_interc_mean."\t".$y_interc_std."\n"); #print (OUTPUT_M $start_ref."\t".$end_ref."\t".$score."\t".$cluster_number."\n"); print (SCORES $score."\n"); open (TEMP, ">".$opt_t); @dum = split(/ /, $clusters_line); $subject_id = substr($dum[0], 1); $query_id = $dum[1]; #$cluster_number = substr($query_id, 66, (length($query_id)-66-1)); $subject_length = $dum[2]; $query_length = $dum[3]; $state = 1; } elsif ($clusters_line =~ /1 1/) { $state = 2; } elsif ($clusters_line =~ /1 -1/) { $state = 3; } else { @dum = split(/ +/, $clusters_line); if ($#dum == 5) #The first element of @dum is empty { $x1 = $dum[1]; $x2 = $dum[1] + $dum[3] - 1; $y1 = $dum[2]; $y2 = $dum[2] - $dum[3] + 1; } else { $x1 = $dum[0]; $x2 = $dum[0] + $dum[2] - 1; $y1 = $dum[1]; $y2 = $dum[1] - $dum[2] + 1; } $slope = ($y2 - $y1)/($x2 - $x1); $y_intersect = $y1 - $slope*$x1; $weight = $x2 - $x1; print (TEMP $x1."\t".$x2."\t".$y1."\t".$y2."\t".$slope."\t".$y_intersect."\t".$weight."\t"."0"."\n"); $state = 5; } } } $start_ref = 0; $end_ref = 0; $start_que = 0; $end_que = 0; $matched = 0; $matched_in_str = 0; $matched_in_clu = 0; $strand = 0; $clu_y_interc = 0; $y_interc_mean = 0; $y_interc_std = 0; $error = $query_length*($opt_e/100); close (TEMP); cluster_process(\$start_ref, \$end_ref, \$start_que, \$end_que, \$matched, \$matched_in_str, \$matched_in_clu, \$strand, \$clu_y_interc, \$y_interc_mean, \$y_interc_std, $error, $opt_t, $opt_d, $opt_u, $opt_p); system("rm ".$opt_t); $perc_matched = ($matched/$query_length)*100; $perc_matched_in_str = ($matched_in_str/$query_length)*100; $perc_matched_in_clu = ($matched_in_clu/$query_length)*100; $lgth_ref = $end_ref - $start_ref; $lgth_que = $end_que - $start_que; if ($lgth_ref < $lgth_que) { $score = (100/4)*($matched/$query_length + $matched_in_str/$matched + $matched_in_clu/$matched_in_str + $lgth_ref/$lgth_que); } else { $score = (100/4)*($matched/$query_length + $matched_in_str/$matched + $matched_in_clu/$matched_in_str + $lgth_que/$lgth_ref); } print (OUTPUT $subject_id."\t".$query_id."\t".$start_ref."\t".$end_ref."\t".$score."\n"); print (OUTPUT_D $subject_id."\t".$query_id."\t".$start_ref."\t".$end_ref."\t".$score."\t".$strand."\t".$query_length."\t".$lgth_que."\t".$lgth_ref."\t".$perc_matched."\t".$perc_matched_in_str."\t".$perc_matched_in_clu."\t".$clu_y_interc."\t".$y_interc_mean."\t".$y_interc_std."\n"); #print (OUTPUT_M $start_ref."\t".$end_ref."\t".$score."\t".$cluster_number."\n"); print (SCORES $score."\n"); close (CLUSTERS); close (OUTPUT); close (OUTPUT_D); #close (OUTPUT_M); close (SCORES); ############################################################################ # Processing of Clusters # ############################################################################ sub cluster_process { # Initial Database Required Parameters my $database_name = $_[13]; my $db_server = "localhost"; my $db_port = "3306"; my $db_user = $_[14]; my $db_password = $_[15]; my $databaseString = "DBI:mysql:$database_name:$db_server:$db_port"; my $dbh = DBI->connect($databaseString, $db_user, $db_password) or die "Can't connect to RDBMS: $DBI::errstr\n"; my $dbh_ref = \$dbh; my @select_matrix = (); my $mat_ref = \@select_matrix; my @select_matrix2 = (); my $mat_ref2 = \@select_matrix2; my $start_ref = $_[0]; my $end_ref = $_[1]; my $start_que = $_[2]; my $end_que = $_[3]; my $matched = $_[4]; my $matched_in_str = $_[5]; my $matched_in_clu = $_[6]; my $strand = $_[7]; my $clu_y_interc = $_[8]; my $y_interc_mean = $_[9]; my $y_interc_std = $_[10]; my $error_marg = $_[11]; my $temp_file = $_[12]; exec_DB_command($dbh_ref, "DROP TABLE IF EXISTS temp1;"); exec_DB_command($dbh_ref, "CREATE TABLE temp1 (x1 INT, x2 INT, y1 INT, y2 INT, slope INT, b INT, weight INT);"); exec_DB_command($dbh_ref, "LOAD DATA LOCAL INFILE '".$temp_file."' INTO TABLE temp1;"); # Find the strand of the main alignment my $main_slope = 0; exec_DB_select($dbh_ref, $mat_ref, "SELECT SUM(weight) FROM temp1 WHERE slope = 1;"); my $w_str_lgth = $select_matrix[0][0]; exec_DB_select($dbh_ref, $mat_ref, "SELECT SUM(weight) FROM temp1 WHERE slope = -1;"); my $c_str_lgth = $select_matrix[0][0]; if (!defined($w_str_lgth)) { $main_slope = -1; ${$matched} = $c_str_lgth; } elsif (!defined($c_str_lgth)) { $main_slope = 1; ${$matched} = $w_str_lgth; } else { if ($w_str_lgth > $c_str_lgth) { $main_slope = 1; } else { $main_slope = -1; } ${$matched} = $w_str_lgth + $c_str_lgth; } ${$strand} = $main_slope; exec_DB_command($dbh_ref, "DROP TABLE IF EXISTS temp2;"); exec_DB_command($dbh_ref, "CREATE TABLE temp2 SELECT b, SUM(weight) AS weight FROM temp1 WHERE slope = ".$main_slope." GROUP BY b;"); exec_DB_command($dbh_ref, "ALTER TABLE temp2 ADD COLUMN filtered INT;"); exec_DB_select($dbh_ref, $mat_ref, "SELECT SUM(weight) FROM temp2;"); ${$matched_in_str} = $select_matrix[0][0]; exec_DB_select($dbh_ref, $mat_ref, "SELECT b, weight FROM temp2;"); my $m = 0; my $b = 0; my $low_range = 0; my $high_range = 0; my $filtered = 0; for ($m = 0; $m < scalar(@select_matrix); $m++) { $b = $select_matrix[$m][0]; if ($b eq '') { $b = 0; } $low_range = $b - $error_marg; $high_range = $b + $error_marg; exec_DB_select($dbh_ref, $mat_ref2, "SELECT SUM(weight) FROM temp2 WHERE b BETWEEN ".$low_range." AND ".$high_range.";"); $filtered = $select_matrix2[0][0]; exec_DB_command($dbh_ref, "UPDATE temp2 SET filtered = ".$filtered." WHERE b = ".$b.";"); } exec_DB_select($dbh_ref, $mat_ref, "SELECT b, filtered FROM temp2 ORDER BY filtered DESC LIMIT 1;"); ${$clu_y_interc} = $select_matrix[0][0]; $low_range = ${$clu_y_interc} - $error_marg; $high_range = ${$clu_y_interc} + $error_marg; exec_DB_select($dbh_ref, $mat_ref, "SELECT SUM(weight) FROM temp2 WHERE b BETWEEN ".$low_range." AND ".$high_range.";"); ${$matched_in_clu} = $select_matrix[0][0]; exec_DB_select($dbh_ref, $mat_ref, "SELECT x1, y1 FROM temp1 WHERE b BETWEEN ".$low_range." AND ".$high_range." ORDER BY x1 LIMIT 1;"); ${$start_ref} = $select_matrix[0][0]; if ($main_slope == 1) { ${$start_que} = $select_matrix[0][1]; } else { ${$end_que} = $select_matrix[0][1]; } exec_DB_select($dbh_ref, $mat_ref, "SELECT x2, y2 FROM temp1 WHERE b BETWEEN ".$low_range." AND ".$high_range." ORDER BY x2 DESC LIMIT 1;"); ${$end_ref} = $select_matrix[0][0]; if ($main_slope == 1) { ${$end_que} = $select_matrix[0][1]; } else { ${$start_que} = $select_matrix[0][1]; } exec_DB_command($dbh_ref, "DROP TABLE IF EXISTS temp3;"); exec_DB_command($dbh_ref, "CREATE TABLE temp3 SELECT b*weight AS moment, b*b*weight AS moment2 FROM temp1;"); exec_DB_select($dbh_ref, $mat_ref, "SELECT SUM(moment), SUM(moment2) FROM temp3;"); ${$y_interc_mean} = $select_matrix[0][0]/${$matched}; ${$y_interc_std} = sqrt($select_matrix[0][1]/${$matched}); exec_DB_command($dbh_ref, "DROP TABLE temp1;"); exec_DB_command($dbh_ref, "DROP TABLE temp2;"); exec_DB_command($dbh_ref, "DROP TABLE temp3;"); } ############################################################################ # Function for Inserts, Udates and Deletes # ############################################################################ sub exec_DB_command { my $dbh = ${$_[0]}; my $sql_statement = $_[1]; my $sth = $dbh->prepare($sql_statement); $sth->execute() or die "Can't execute SQL statement: $dbh->errstr"; $sth->finish; } ############################################################################ # Function for Selects # ############################################################################ sub exec_DB_select { my $dbh = ${$_[0]}; my $mat_ref = $_[1]; my $sql_statement = $_[2]; my $sth = $dbh->prepare($sql_statement); my $array_ref; my $record; my $m = 0; my $n = 0; $sth->execute() or die "Can't execute SQL statement: $dbh->errstr"; @$mat_ref = (); while ($array_ref = $sth->fetchrow_arrayref) { while ($record = $array_ref->[$n]) { $mat_ref->[$m]->[$n] = $record; $n++; } $n = 0; $m++; } $sth->finish; }