#!/usr/bin/perl
#
# eventview.pl - show BioNLP09 Shared Task standoff files in a readable shape.
#
# For every a2 file given on the command line, the PMID is taken from the
# file name, "<dir>/<PMID>.txt" and "<dir>/<PMID>.a1" are loaded, and one
# line per event is printed to STDOUT:
#
#     PMID-event_ID [TAB] modifier event_class [TAB] trigger <role>argument ...
#
# Problems found in the input are reported on STDERR; processing continues.

use 5.008;    # 'our' and the "<:utf8" open layer are used below
use strict;
use warnings;

use Getopt::Std;
use List::Util qw(max min);

my $rDir       = '.';        # directory holding the *.txt and *.a1 files
my $opt_string = 'hr:oe';    # option specification handed to getopts()
our %opt;

## annotations.
# - they are referenced globally.
# - they are re-initialized for every file (see view_file).
my $text = '';               # document text, decoded into characters
my (%anno, %mod);            # id => [type, ...] ; event id => 'N' | 'S' | 'Z'
my $pmid;                    # PMID of the file being processed

# Run the command-line driver only when executed as a script, so that the
# helper subs can be loaded (e.g. by a test) without side effects.
main() unless caller;

# Print the help message to STDERR and terminate the program.
sub usage {
    print STDERR <<"EOF";

[eventview.pl] last updated by jdkim\@is.s.u-tokyo.ac.jp on 16 Feb, 2008.
It is written to support the BioNLP09 Shared Task:
http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/SharedTask/

It shows the standoff files in a readable shape.

DESCRIPTION :
    PMID-event_ID [TAB] [N|S|Z] event_class [TAB] event_trigger ARG_text ...
EXAMPLE :
    9796702-E9 N +Regulation induce Akt p21ras
* 'N' stands for 'Negation', 'S' stands for 'Speculation', 'Z' stands for both.

$0 [-$opt_string] a2_file(s)
-h     : this (help) message.
-r dir : specifies the directory in which the *.txt and *.a1 files are placed. (default = '$rDir/')
-o     : tells it to show offsets of each text span.
-e     : tells it to show excerpts of each event.

EOF
    exit;
}

# Command-line entry point: parse the options, then render every a2 file
# named in @ARGV.
sub main {
    getopts($opt_string, \%opt) or usage();
    usage() if $opt{h};
    usage() unless @ARGV;

    if (defined $opt{r} && length $opt{r}) {
        $rDir = $opt{r};
        $rDir =~ s/\/$//;
    }

    # The text file is decoded on input, so the output has to be encoded
    # again; without a layer non-ASCII characters are written inconsistently.
    binmode STDOUT, ':utf8';

    foreach my $fname (@ARGV) {
        view_file($fname);
    }
    return;
}

# Load the text, the a1 file and the given a2 file of one document and print
# one line per event. A document is skipped (with a message on STDERR) when
# its name is not recognized or one of its files cannot be opened.
sub view_file {
    my ($fname) = @_;

    if ($fname =~ /([0-9]+)(\.a2\.t12?3?$|\.a2$)/) {
        $pmid = $1;
    }
    else {
        print STDERR "Unrecognizable filename: $fname\n";
        return;
    }

    ## local initialization of annotations
    $text = '';
    %anno = ();
    %mod  = ();

    # read text file
    my $text_path = "$rDir/$pmid.txt";
    my $text_fh;
    if (!open($text_fh, '<:utf8', $text_path)) {
        print STDERR "cannot open text file: $text_path\n";
        return;
    }
    $text = do { local $/; <$text_fh> };
    $text = '' unless defined $text;    # empty file
    close($text_fh);

    # read annotation files; the a1 file first because the a2 file refers to it
    read_so_file("$rDir/$pmid.a1") or return;
    read_so_file($fname)           or return;

    # output
    foreach my $eid (sorted_event_ids()) {
        print event_line($eid), "\n";
    }
    return;
}

# Return the ids of all loaded events, ordered by the begin offset of their
# trigger. Ties are broken by event id so that the output does not depend on
# hash ordering.
sub sorted_event_ids {
    my %offset_of;
    foreach my $eid (grep { /^E/ } keys %anno) {
        my ($beg) = trange($anno{$eid}[1]);
        $offset_of{$eid} = defined $beg ? $beg : 0;    # unresolved trigger sorts first
    }
    my %number_of = map { $_ => (/^E([0-9]+)/ ? $1 : 0) } keys %offset_of;

    return sort {
               $offset_of{$a} <=> $offset_of{$b}
            || $number_of{$a} <=> $number_of{$b}
            || $a cmp $b
    } keys %offset_of;
}

# Build the output line (without trailing newline) for the event $eid.
# With -e, a tab and the bracketed excerpt are appended.
sub event_line {
    my ($eid) = @_;
    my ($type, $etid, @arg) = @{ $anno{$eid} };

    # abbreviate the regulation classes
    $type =~ s/^Positive_r/+R/;
    $type =~ s/^Negative_r/-R/;
    my $flag = $mod{$eid} ? $mod{$eid} : ' ';

    my $exp   = "$pmid-$eid\t$flag $type\t" . tspan($etid);
    my @range = ();
    my @trigger_span = trange($etid);
    push @range, [@trigger_span] if @trigger_span;

    foreach my $arg (@arg) {
        my ($atype, $atid) = split ':', $arg;
        $atype = '' unless defined $atype;
        $atid  = '' unless defined $atid;
        $exp .= " <$atype>" . tspan($atid);

        # only text-bound arguments contribute a span; event arguments do not
        my @span = trange($atid);
        push @range, [@span] if @span;
    }

    if ($opt{e}) {
        @range = sort { $$a[0] <=> $$b[0] || $$a[1] <=> $$b[1] } @range;
        $exp .= "\t" . excerpt_of($text, range_uniq(@range));
    }
    return $exp;
}

# Return the piece of $source that covers all the given [begin, end] spans,
# with every span enclosed in square brackets.
#   $source : the string the offsets refer to
#   @range  : array refs [begin, end]; any order, may nest or overlap
# Returns '' when no span is given. Offsets beyond the end of $source are
# clamped to its length, and an end before its begin is treated as empty.
sub excerpt_of {
    my ($source, @range) = @_;
    return '' unless @range;

    my $limit = length $source;
    my @span  = map {
        my $from = min($$_[0], $limit);
        [ $from, max($from, min($$_[1], $limit)) ]
    } @range;

    my $base    = min(map { $$_[0] } @span);
    my $stop    = max(map { $$_[1] } @span);
    my $excerpt = substr($source, $base, $stop - $base);

    # Count the brackets due at each position instead of inserting them one
    # by one: insertions would shift the offsets of nested/overlapping spans.
    my (%open_at, %close_at, %empty_at);
    foreach my $span (@span) {
        my ($from, $to) = ($$span[0] - $base, $$span[1] - $base);
        if ($from == $to) {
            $empty_at{$from}++;
        }
        else {
            $open_at{$from}++;
            $close_at{$to}++;
        }
    }

    my %cut    = map { $_ => 1 } keys %open_at, keys %close_at, keys %empty_at;
    my $marked = '';
    my $done   = 0;
    foreach my $pos (sort { $a <=> $b } keys %cut) {
        $marked .= substr($excerpt, $done, $pos - $done);
        $marked .= ']'  x ($close_at{$pos} || 0);    # close before opening: "a][b"
        $marked .= '[]' x ($empty_at{$pos} || 0);
        $marked .= '['  x ($open_at{$pos}  || 0);
        $done = $pos;
    }
    return $marked . substr($excerpt, $done);
}

# Return the display text of an annotation reference.
#   T-id : the text it covers, followed by "[begin-end]" with -o
#   else : the id itself (e.g. the id of an argument event)
# A T-id that cannot be resolved yields '' ('[-]' with -o).
sub tspan {
    my ($id) = @_;
    return '' unless defined $id;
    return $id unless $id =~ /^T/;

    my ($beg, $end) = trange($id);
    if (!defined $beg) {
        return $opt{o} ? '[-]' : '';
    }

    my $span = $beg <= length($text) ? substr($text, $beg, $end - $beg) : '';
    return $opt{o} ? $span . "[$beg-$end]" : $span;
} # tspan

# Return the (begin, end) offsets of the text annotation $id, or an empty
# list when $id is not a loaded T-id with numeric offsets.
# Call it in list context.
sub trange {
    my ($id) = @_;
    return unless defined $id && $id =~ /^T/ && $anno{$id};

    my (undef, $beg, $end) = @{ $anno{$id} };
    return unless defined $beg && defined $end;
    return unless $beg =~ /^[0-9]+$/ && $end =~ /^[0-9]+$/;
    return ($beg, $end);
} # trange

# Drop repeated spans from a list of [begin, end] array refs, keeping the
# first occurrence of each. Two spans are equal when both offsets agree.
sub range_uniq {
    my (@range) = @_;
    my %seen;
    return grep { !$seen{"$$_[0]-$$_[1]"}++ } @range;
} # range_uniq

# Load one standoff file (a1 or a2) into %anno and %mod.
#   T lines : $anno{id} = [type, begin, end]
#   E lines : $anno{id} = [type, trigger id, "role:id" arguments ...]
#   M lines : $mod{event id} = 'N' or 'S', or 'Z' when both were given
#   * lines : accepted and ignored
# Inconsistencies are reported on STDERR but the entry is still recorded.
# Returns 1 on success and '' when the file cannot be opened.
sub read_so_file {
    my ($fname) = @_;

    my $fh;
    if (!open($fh, '<', $fname)) {
        print STDERR "cannot open the file: $fname\n";
        return '';
    }
    my @line = <$fh>;
    close($fh);
    chomp(@line);

    my $label = defined $pmid ? $pmid : '';    # document id shown in messages

    foreach my $line (@line) {
        next if $line =~ /^\s*$/;

        my ($id, $exp) = split /\t/, $line;
        $exp = '' unless defined $exp;

        if ($line =~ /^T/) {
            my ($type, $beg, $end) = split ' ', $exp;
            $anno{$id} = [$type, $beg, $end];
        } # if

        elsif ($line =~ /^E/) {
            my @arg  = split ' ', $exp;
            my $head = @arg ? shift @arg : '';
            my ($type, $tid) = split ':', $head;
            $type = '' unless defined $type;
            $tid  = '' unless defined $tid;

            # plain lookup: must not create an entry for a missing trigger
            my $trigger = $anno{$tid};
            my $trigger_type = ($trigger && defined $$trigger[0]) ? $$trigger[0] : '';

            if (($tid !~ /^T[0-9]+$/) || !$trigger) {
                print STDERR "invalid reference to text annnotation: [$label]\t$line\n";
            }
            if ($type ne $trigger_type) {
                print STDERR "inconsistent event type: [$label]\t$line\n";
            }

            $anno{$id} = [$type, $tid, @arg];
        } # elsif

        elsif ($line =~ /^M/) {
            my ($kind, $eid) = split ' ', $exp;
            $kind = defined $kind ? substr($kind, 0, 1) : '';
            $eid  = '' unless defined $eid;

            if (($kind ne 'N') && ($kind ne 'S')) {
                print STDERR "invalid type of event modification: [$label]\t$line\n";
            }
            if (($eid !~ /^E[0-9]+$/) || !$anno{$eid}) {
                print STDERR "invalid reference to event annotation: [$label]\t$line\n";
            }

            # two different modifications on the same event combine into 'Z'
            if ($mod{$eid} && ($mod{$eid} ne $kind)) {
                $mod{$eid} = 'Z';
            }
            else {
                $mod{$eid} = $kind;
            }
        } # elsif

        elsif ($line =~ /^\*/) {
            # nothing to record for these lines
        }

        else {
            print STDERR "invalid ID prefix: [$label]\t$line\n";
        }
    } # foreach

    return 1;
} # read_so_file