#!/usr/bin/perl -w =head1 NAME compare-defs.pl - compare term defs between two OBO files =head1 SYNOPSIS compare-defs.pl --file_1 old_gene_ontology.obo --file_2 gene_ontology.obo -o results.txt =head1 DESCRIPTION Compares the defs in two OBO files and records the differences between them =head2 Input parameters =head3 Required =over =item -o || --output /path/to/file_name output file for results =back =head3 Configuration options =over =item Comparing two existing files Enter the two files using the following syntax: -f1 /path/to/file_name -f2 /path/to/file_2_name where f1 is the "old" ontology file and f2 is the "new" file =item Comparing ontology files from two different dates Enter the file and the two dates using this syntax: -f /path/to/file -d1 "date one here" -d2 "date two here" The dates must be in a CVS-parseable form, e.g. "01 Dec 2010" or "2010-01-30" (YYYY-MM-DD) If -f is left blank, the default file used is /go/ontology/editors/gene_ontology_write.obo If d2 is omitted, the most recent version of file I will be used. =item Comparing ontology files by revision number Enter the file and the two revisions using this syntax: -f /path/to/file -r1 1.2345 -r2 1.2346 The dates must be in a CVS-parseable form, e.g. "01 Dec 2010" or "2010-01-30" (YYYY-MM-DD) If -f is left blank, the default file used is /go/ontology/editors/gene_ontology_write.obo If r2 is omitted, the most recent version of file I will be used. =back =head3 Optional switches =over =item -c || --db_counts Get annotation counts for any terms that have changed from a database. Note that the appropriate DB connection parameters should be entered into the "defaults" at the top of the file. =item -m || --mode Choose the file format for output, either text or html. Defaults to HTML =item -v || --verbose prints various messages during the execution of the script =back =cut use strict; use Data::Dumper; $Data::Dumper::Sortkeys = 1; my $bin_dir; my $dist_dir; BEGIN { use Cwd; use File::Basename; $bin_dir = dirname(__FILE__); $bin_dir = Cwd::abs_path($bin_dir); ($dist_dir = $bin_dir) =~ s/bin\/?$//; } use lib ($dist_dir, $bin_dir); package main; use strict; use Template; use File::Temp; use MIME::Lite; use GOBO::Logger; use Text::WordDiff; use GOBO::FileCompareExtras; use DateTime::Format::Strptime; my $logger; ## read in our config my $defaults = GOBO::FileCompareExtras::get_config( $dist_dir ); if (! $defaults) { die("No default settings found. Dying"); } run_script($defaults, \@ARGV); exit(0); sub run_script { my $options = parse_options(@_); $logger->info( "Parsed options. Now starting script..." ); ## quick 'diff' check of whether the files are identical or not my $cmd = "diff -w -q -i '" . $options->{f1} . "' '" . $options->{f2} . "'"; my $status = `$cmd`; my $identical; ## the files are identical! if (! $status) { ## if we're reporting identical files and the output is html or txt, carry on if ($options->{report_identical} && ($options->{mode}{html} || $options->{mode}{txt})) { $identical++; $logger->warn("The two files specified appear to be identical"); } ## otherwise, die else { $logger->logdie("The two files specified appear to be identical!"); } } my $t_args = { INCLUDE_PATH => $options->{inc_path}, }; my $tt = Template->new($t_args) || $logger->logdie("$Template::ERROR"); my $data; my $output; # open(OUT, ">" . $options->{'output'}) or die("Could not create " . $options->{output} . " : $!"); open(FH, "<" . $options->{'f1'}) or die("Could not open f1, " . $options->{'f1'} . "! $!"); my @arr; # remove and parse the header { local $/ = "\n["; @arr = grep { /(^date: | cvs version)/i } split("\n", ); ## date: 04:01:2011 16:56 ## remark: cvs version: $Revision: 1.1692 $ foreach (@arr) { if ($_ =~ /date: (.+)$/) { $data->{f1_date} = $1; } elsif ($_ =~ /cvs version: \$Revision: (\S+)/) { $data->{f1_cvs} = $1; } } if ( ! $data->{f1_date} || ! $data->{f1_cvs} ) { $logger->warn("Could not find the data or cvs version of f1, " . $options->{'f1'}); } if (! $identical) { #$logger->info( "Parsed $f header; starting body" ); my @lines; { local $/ = "\n["; while () { if (/^(\S+)\]\s*.*?^id:\s*(\S+)/sm) { # extract the interesting data if ($1 eq "Term") { my $h; map { if (/(.*?): ?(.+)( ?\!.*)?/) { $h->{$1} = $2; } } grep { /^(id|name|def|is_obsolete):/ } split("\n", $_); if ($h->{def}) { ## clip off the def xrefs if ($h->{def} =~ /^\"(.*)\"\s*(\[.*)/) { $h->{def} = $1; $h->{simple_def} = lc($h->{def}); $h->{simple_def} =~ s/[^a-z0-9 ]//g; } else { $logger->warn("Could not parse def for " . $h->{id}); } } $data->{f1}{ $h->{id} } = $h; } } } } } } close(FH); $logger->info( "Parsed " . $options->{f1} ); open(FH, "<" . $options->{'f2'}) or die("Could not open f2, " . $options->{'f2'} . "! $!"); # remove and parse the header { local $/ = "\n["; @arr = grep { /(^date: | cvs version)/i } split("\n", ); foreach (@arr) { if ($_ =~ /date: (.+)$/) { $data->{f2_date} = $1; } elsif ($_ =~ /cvs version: \$Revision: (\S+)/) { $data->{f2_cvs} = $1; } } if (! $data->{f2_date} || ! $data->{f2_cvs}) { warn "Could not find the data or cvs version of f2, " . $options->{'f2'}; } if (! $identical) { # $logger->info( "Parsed $f header; starting body" ); my @lines; { local $/ = "\n["; while () { if (/^(\S+)\]\s*.*?^id:\s*(\S+)/sm) { # extract the interesting data if ($1 eq "Term") { my $h; map { if (/(.*?): ?(.+)( ?\!.*)?/) { $h->{$1} = $2; } } grep { /^(id|name|def|is_obsolete):/ } split("\n", $_); if ($h->{def}) { ## clip off the def xrefs if ($h->{def} =~ /^\"(.*)\"\s*(\[.*)/) { $h->{def} = $1; $h->{simple_def} = lc($h->{def}); $h->{simple_def} =~ s/[^a-z0-9 ]//g; } else { warn "Could not parse def for " . $h->{id}; } } if ($data->{f1}{ $h->{id} }) { ## existing term if ($data->{f1}{$h->{id}}{simple_def} && $h->{simple_def} && $h->{simple_def} ne $data->{f1}{ $h->{id} }{simple_def}) { if ($h->{is_obsolete}) { $logger->info( "Got an obsolete term!" ); $h->{simple_def} =~ s/^obsolete\s*//; if ($h->{simple_def} eq $data->{f1}{$h->{id}}{simple_def}) { ## term has been obsoleted. Don't show. $logger->info("Ignoring obsolete term " . $h->{id}); } else { $logger->info("Including obsolete term " . $h->{id}); $data->{changed}{$h->{id}}++; $data->{f2}{$h->{id}} = $h; } } else { $data->{changed}{$h->{id}}++; $data->{f2}{$h->{id}} = $h; } } } } } } } } } close(FH); $logger->info("Finished parsing files!"); if ($identical) { ## the files are identical. Prepare for output! output_data( options => $options, output => { data => $data, %{$defaults->{html}} }, tt => $tt ); exit(0); } foreach (keys %{$data->{changed}}) { my $diff = word_diff(\$data->{f1}{$_}{def}, \$data->{f2}{$_}{def}, { STYLE => 'HTMLTwoLinesLite' }); if ($diff) { if ($diff =~ /div class="file"/) { $diff =~ s/(<\/?)div/$1span/gm; my @arr = split(/<\/span>\s*{f1}{$_}{def_diffs} = $arr[0] . ""; $data->{f2}{$_}{def_diffs} = '\s*{f1}{$_}{def_diffs} = $arr[0] . ""; $data->{f2}{$_}{def_diffs} = 'info( "Parsed " . $options->{f2} . "\nGathering results for printing..."); if (! $data->{changed}) { $output->{no_ontology_changes} = 1; # close OUT; # die("No changed definitions were found in the files specified."); } ## optional: connect to a db to find annotations if ($options->{db_counts}) { get_db_counts( options => $options, data => $data ); } $output->{data} = $data; ## clean up our mess if ($options->{f_moved}) { ## move this back my $cmd = "cp " . $options->{f_moved} . " " . $options->{f}; `$cmd`; } ## email / rss reports: only if there have been changes if ($options->{mode}{email} || $options->{mode}{rss}) { if ($data->{changed}) { ## generate the date my $parser = DateTime::Format::Strptime->new(pattern => "%d:%m:%Y %H:%M"); my $date; ## get the date from the header of f2 if ($data->{f2_date}) { $date = $parser->parse_datetime( $data->{f2_date} ); } else { $date = DateTime->now(); } $output->{date_object} = $date; $output->{full_date} = $date->strftime("%a, %d %b %Y %H:%M:%S %z"), $output->{nice_date} = join(" ", $date->day, $date->month_abbr, $date->year); } if ($options->{mode}{rss}) { create_rss( options => $options, output => { %$output, %{$defaults->{html}} }, tt => $tt ) if $data->{changed}; } if ($options->{mode}{email}) { create_email( options => $options, output => { %$output, %{$defaults->{html}} }, tt => $tt ) if $data->{changed}; } delete $options->{mode}{rss}; delete $options->{mode}{email}; } output_data( options => $options, output => { %$output, %{$defaults->{html}} }, tt => $tt ); } sub output_data { my %args = (@_); my $tt = $args{tt}; $logger->info("options: " . Dumper($args{options})); foreach my $m (keys %{$args{options}->{mode}}) { next unless ($m eq 'txt' || $m eq 'html'); $tt->process( 'def_changes_' . $m . '.tmpl', $args{output}, $args{options}->{mode}{$m} ) || die $tt->error(), "\n"; } } sub create_email { my %args = (@_); my $tt = $args{tt}; $logger->info("Processing the email report..."); my $body; $tt->process( 'def_changes_email.tmpl', $args{output}, \$body ) || die $tt->error(), "\n"; # Construct the MIME::Lite object. my $mail = MIME::Lite->new( From => $args{options}->{email_from}, To => $args{options}->{email_to}, # Bcc => $maintainer}, Subject => 'New ' . $defaults->{html}{ontology_name} . ' Definition Changes', Encoding => 'quoted-printable', Data => $body, Type => 'text/html', ); # This is added to prevent post-send attachments (e.g. as part # of an email list) messing up the html segment. Comment out # if this is unnecessary. $mail->attach( Data => ' ', Type => 'text/plain', ); # Finally, send the mail. if ($args{options}->{smtp_server}) { $mail->send( 'smtp', $args{options}->{smtp_server} ) || $logger->error("Could not send mail!"); } else { $mail->send() || $logger->error("Could not send mail!"); } $logger->info( $mail->as_string ); $logger->info("Finished processing email report!"); } sub create_rss { my %args = (@_); my $tt = $args{tt}; my $def_file = $args{options}->{rss_path} . 'def_diffs.rss'; my $date = $args{output}->{date_object}; my $old = $date->clone->subtract( months => 12 ); my $parser = DateTime::Format::Strptime->new(pattern => "%a, %d %b %Y %H:%M:%S %z"); ## create the def rss my $old_data = GOBO::FileCompareExtras::trim_rss( file => $def_file, date => $old ); $tt->process( 'def_changes_rss.tmpl', { %{$args{output}}, old_data => $old_data }, $def_file, ) || die $tt->error(), "\n"; } # parse the options from the command line sub parse_options { my ($opt, $args) = @_; my $errs; while (@$args && $args->[0] =~ /^\-/) { my $o = shift @$args; ## file to use if r1/d1 and r2/d2 are being used if ($o eq '-f' || $o eq '--file' || $o eq '--file') { if (@$args && $args->[0] !~ /^\-/) { $opt->{f} = shift @$args; } } elsif ($o eq '-f1' || $o eq '--file_1' || $o eq '--file_one') { if (@$args && $args->[0] !~ /^\-/) { $opt->{f1} = shift @$args; } } elsif ($o eq '-f2' || $o eq '--file_2' || $o eq '--file_two') { if (@$args && $args->[0] !~ /^\-/) { $opt->{f2} = shift @$args; } } elsif ($o eq '-d1' || $o eq '--date_1' || $o eq '--date_one') { if (@$args && $args->[0] !~ /^\-/) { $opt->{d1} = shift @$args; $opt->{d1} =~ s/(^["']|["']$)//g; } } elsif ($o eq '-d2' || $o eq '--date_2' || $o eq '--date_two') { if (@$args && $args->[0] !~ /^\-/) { $opt->{d2} = shift @$args; $opt->{d2} =~ s/(^["']|["']$)//g; } } elsif ($o eq '-r1' || $o eq '--rev_1' || $o eq '--rev_one') { if (@$args && $args->[0] !~ /^\-/) { $opt->{r1} = shift @$args; $opt->{r1} =~ s/(^["']|["']$)//g; } } elsif ($o eq '-r2' || $o eq '--rev_2' || $o eq '--rev_two') { if (@$args && $args->[0] !~ /^\-/) { $opt->{r2} = shift @$args; $opt->{r2} =~ s/(^["']|["']$)//g; } } elsif ($o eq '--rss-path') { if (@$args && $args->[0] !~ /^\-/) { $opt->{rss_path} = shift @$args; $opt->{rss_path} .= '/' unless $opt->{rss_path} =~ /\/$/; } } elsif ($o eq '--config') { if (@$args && $args->[0] !~ /^\-/) { my $a = shift @$args; if ($a =~ /(.+)\s*=(.*)/) { my ($k,$v) = ($1,$2); my @lookup = split(/\//,$k); if (@lookup ==1) { $opt->{$k} = $v; } elsif (@lookup ==2) { $opt->{$lookup[0]}->{$lookup[1]} = $v; } elsif (@lookup ==3) { $opt->{$lookup[0]}->{$lookup[1]}->{$lookup[2]} = $v; } else { die "@lookup"; } } else { die $a; } } } elsif ($o eq '-o' || $o eq '--output') { if (@$args && $args->[0] !~ /^\-/) { $opt->{output} = shift @$args; } } elsif ($o eq '-m' || $o eq '--mode') { while (@$args && $args->[0] !~ /^\-/) { my $m = shift @$args; $m = lc($m); if (grep { $m eq $_ } (qw(txt text html rss email))) { $m = 'txt' if $m eq 'text'; $opt->{mode}{$m} = 1; } } } elsif ($o eq '-c' || $o eq '--db_counts') { $opt->{db_counts} = 1; require DBI; require DBD::mysql; } elsif ($o eq '-h' || $o eq '--help') { system("perldoc", $0); exit(0); } elsif ($o eq '-v' || $o eq '--verbose') { $opt->{verbose} = 1; } elsif ($o eq '--galaxy') { $opt->{galaxy} = 1; } else { push @$errs, "Ignored nonexistent option $o"; } } return check_options($opt, $errs); } # process the input params sub check_options { my ($opt, $errs) = (@_); if (!$opt) { GOBO::Logger::init_with_config( 'standard' ); $logger = GOBO::Logger::get_logger(); $logger->logdie("Error: please ensure you have specified the input file(s) and/or date(s)/revision(s) and an output file.\nThe help documentation can be accessed with the command\n\t" . scr_name() . " --help"); } if (! $opt->{verbose}) { $opt->{verbose} = $ENV{GO_VERBOSE} || 0; } if ($opt->{galaxy}) { GOBO::Logger::init_with_config( 'galaxy' ); $logger = GOBO::Logger::get_logger(); } elsif ($opt->{verbose} || $ENV{DEBUG}) { GOBO::Logger::init_with_config( 'verbose' ); $logger = GOBO::Logger::get_logger(); } else { GOBO::Logger::init_with_config( 'standard' ); $logger = GOBO::Logger::get_logger(); } if ($errs && @$errs) { foreach (@$errs) { $logger->error($_); } } undef $errs; if (! $opt->{mode}) { $opt->{mode}{html} = 1; } ## make sure that html and text files have an output specified if (! $opt->{output} && ( $opt->{mode}{html} || $opt->{mode}{txt} )) { push @$errs, "specify an output file using -o /path/to/"; } else { if ($opt->{mode}{html} && $opt->{mode}{txt}) { ## use the file name from $opt->{output} plus suffix foreach my $m (qw( html txt )) { $opt->{mode}{$m} = $opt->{output} . "." . $m; ## make sure that if the file exists, we can write to it if (-e $opt->{output} && ! -w $opt->{output}) { push @$errs, $opt->{output} . " already exists and cannot to be written to"; } } } elsif ($opt->{mode}{html} || $opt->{mode}{txt}) { foreach my $m (qw( html txt )) { if ($opt->{mode}{$m}) { ## give the file the appropriate suffix if lacking if ($opt->{output} !~ /\.$m$/ && ! $opt->{galaxy}) { $opt->{mode}{$m} = $opt->{output} . ".$m"; $logger->warn($m . " output will be saved in file " . $opt->{mode}{$m}); } else { $opt->{mode}{$m} = $opt->{output}; } ## make sure that if the file exists, we can write to it if (-e $opt->{mode}{$m} && ! -w $opt->{mode}{$m}) { push @$errs, $opt->{mode}{$m} . " already exists and cannot to be written to"; } } } } } ## INPUT OPTIONS: ## - specify f1 and f2 ## - specify f, r1/d1, r2/d2 ## - specify f, r1/d1 and use most recent file as r2/d2 if ($opt->{f1} || $opt->{f2}) { foreach my $f (qw(f1 f2)) { if (!$opt->{$f}) { push @$errs, "specify an input file using -$f /path/to/"; } elsif (! -e $opt->{$f}) { push @$errs, "the file " . $opt->{$f} . " could not be found."; } elsif (! -r $opt->{$f} || -z $opt->{$f}) { push @$errs, "the file " . $opt->{$f} . " could not be read."; } } } elsif ($opt->{d1} || $opt->{d2} || $opt->{r1} || $opt->{r2}) { ## make sure that f is specified!! if (! $opt->{f}) { push @$errs, "specify an input file using -f /path/to/"; } ## make sure that we don't have mixed params if ($opt->{r1} && $opt->{d1} || $opt->{r1} && $opt->{d2} || $opt->{r2} && $opt->{d1} || $opt->{r2} && $opt->{d2}) { push @$errs, "please use either date or revisions, not both"; } ## make sure that we have d1 / r1 if we have d2 / r2 if ( $opt->{d2} && ! $opt->{d1} || $opt->{r2} && ! $opt->{r1} ) { push @$errs, "r2/d2 is specified but r1/d1 is not"; } if (! $errs) { ## OK, if we are getting files from CVS, we need to fetch the files and store them somewhere ## move any existing file out of the way my $f_name = $opt->{f}; $logger->debug( "f_name: " . $f_name ); if (-e $f_name) { my $cmd = "cp $f_name $f_name" . "-current"; `$cmd`; warn "Moved current $f_name to $f_name" . "-current"; $opt->{f_moved} = $f_name . "-current"; } foreach my $x (qw( r d )) { foreach my $n (qw(1 2)) { next unless $opt->{ $x . $n }; my $temp = File::Temp->new(); my $cmd = "perl " . $defaults->{dist_path} . "/bin/cvs-retriever.pl -v -" . $x . " " . $opt->{$x.$n}. " -o " . $temp->filename; if ($opt->{galaxy}) { $cmd .= " -g"; } elsif ($opt->{verbose}) { $cmd .= " -v"; } $logger->info("cmd: $cmd"); my $status; eval { $status = `$cmd` }; $logger->info("status: $status"); if ($@) { $logger->error("CVS problem: $@"); } else { ## turn the file into a temporary file $opt->{"f" . $n} = $temp; } } } ## second file not specified: use the current date instead if (! $opt->{r2} && ! $opt->{d2}) { my $temp = File::Temp->new(); my $cmd = "perl " . $defaults->{dist_path} . "/bin/cvs-retriever.pl -v -o " . $temp->filename; if ($opt->{galaxy}) { $cmd .= " -g"; } elsif ($opt->{verbose}) { $cmd .= " -v"; } my $status; eval { $status = `$cmd` }; $logger->info("status: $status"); if ($@) { $logger->error("CVS problem: $@"); } else { ## turn the file into a temporary file $opt->{f2} = $temp; } } } } else { push @$errs, "specify either a pair of files to compare or a file name and dates or revisions to compare"; } ## make sure that we can find the template directory! my @paths = split(":", $opt->{inc_path}); my $pass; foreach (@paths) { $_ =~ s/\/$//; foreach my $m (keys %{$opt->{mode}}) { if (-e $_ . "/def_changes_" . $m . '.tmpl') { $pass->{$m}++; # last; } } } if (! $pass || scalar keys %{$opt->{mode}} != scalar keys %$pass) { push @$errs, "could not find the template file; check the paths in \$defaults->{inc_path}"; } if ($errs && @$errs) { $logger->logdie("Please correct the following parameters to run the script:\n" . ( join("\n", map { " - " . $_ } @$errs ) ) . "\nThe help documentation can be accessed with the command\n\t" . scr_name() . " --help"); } return $opt; } sub get_db_counts { my %args = (@_); my $options = $args{options}; my $data = $args{data}; if (! $args{options} || ! $options->{dbdriver} || ! $options->{dbname} || ! $options->{dbhost}) { $logger->error("Missing parameters for DB connection!"); return; } ## connect my $dsn = "DBI:" . $options->{dbdriver} . ":database=" . $options->{dbname} . ";host=" . $options->{dbhost}; if ($options->{dbport}) { $dsn .= ";port=" . $options->{dbport}; } my $dbh = DBI->connect( $dsn, $options->{dbuser}, $options->{dbpass} ) or warn "Could not connection to the database: " . $DBI::errstr; if ($dbh) { my $i_data = $dbh->selectall_hashref( 'SELECT * FROM instance_data', 'release_name' ); ## save this info foreach (keys %$i_data) { $data->{db_data}{$_} = $i_data->{$_}; last; } $logger->info( "data->{db_data}: " . Dumper($data->{db_data})); ## get direct associations my $direct = $dbh->selectall_arrayref( 'SELECT term.acc, COUNT(association.id) FROM association INNER JOIN term ON term.id=association.term_id WHERE term.acc IN ("' . join('","', keys %{$data->{changed}}) . '") GROUP BY term.id' ); if ($direct && @$direct) { foreach (@$direct) { ## [ term.acc, count ] $data->{f2}{ $_->[0] }{direct} = $_->[1]; } $logger->info( "direct annots: " . Dumper($direct)); } ## get indirect associations my $indirect = $dbh->selectall_arrayref( 'SELECT term.acc, COUNT(association.id) FROM term INNER JOIN graph_path ON term.id=graph_path.term1_id INNER JOIN association ON association.term_id=graph_path.term2_id WHERE term.acc IN ("' . join('","', keys %{$data->{changed}}) . '") GROUP BY term.id' ); if ($indirect && @$indirect) { foreach (@$indirect) { ## [ term.acc, count ] $data->{f2}{ $_->[0] }{indirect} = $_->[1]; } $logger->info( "indirect annots: " . Dumper($indirect)); } } } sub scr_name { my $n = $0; $n =~ s/^.*\///; return $n; } 1; { package HTMLTwoLinesLite; use strict; use HTML::Entities qw(encode_entities); use base 'Text::WordDiff::Base'; use Data::Dumper; sub file_header { my $self = shift; my $header = $self->SUPER::file_header(@_); if ($header) { $self->{__str1} = $self->{__str2} = qq{$header}; } else { $self->{__str1} = $self->{__str2} = ''; } return ''; return '' unless $header; return qq{$header}; } sub hunk_header { my $self = shift; # $self->{__str1} .= ''; # $self->{__str2} .= ''; return ''; } sub hunk_footer { my $self = shift; # $self->{__str1} .= ''; # $self->{__str2} .= ''; return ''; } sub file_footer { my $self = shift; $self->{__str1} .= ''; $self->{__str2} .= ''; return $self->{__str1} . $self->{__str2}; } sub same_items { my $self = shift; $self->{__str1} .= encode_entities( join '', @_ ); $self->{__str2} .= encode_entities( join '', @_ ); return ''; } sub delete_items { my $self = shift; $self->{__str1} .= '' . encode_entities( join '', @_ ) . ''; return ''; } sub insert_items { my $self = shift; $self->{__str2} .= '' . encode_entities( join '', @_ ) . ''; return ''; } 1; } 1; ################################################################################ =head1 AUTHOR Amelia Ireland =cut