#!/usr/bin/perl require 5.000; use strict; my %target_eclass = ( ('Localization', 1), ('Binding', 1), ('Gene_expression', 1), ('Transcription', 1), ('Protein_catabolism', 1), ('Phosphorylation', 1), ('Regulation', 1), ('Positive_regulation', 1), ('Negative_regulation',1 ) ); my $rDir = '.'; my $opt_string = 'har:'; our %opt; sub usage() { print STDERR << "EOF"; [standoff-check] last updated by jdkim\@is.s.u-tokyo.ac.jp on 19 Feb 2009 It performs formt checking of /*.a2.t12?3?/ files. It is developed to support the BioNLP09 Shared Task. Each line in the /*.a2.t12?3?/ files to submit have to be formatted as follows: 1) entity / event trigger annotation T[1-9][0-9]*(Entity|event_type)(0|[1-9][0-9]*)[1-9][0-9]* ------------ --------------- ----------- ID offset-begin offset-end 2) event annotation E[1-9][0-9]*event_type:ID_to_trigger(argument_type:ID_to_entity)+ 3) modifier annotation M[1-9][0-9]*(Negatin|Speculation)ID_to_event $0 [-$opt_string] answer_a2_file(s) *every answer_a2_file has to have one of the valid suffixes (.a2.t1, .a2.t12, .a2.t13, or .a2.t123). -h : this (help) message. -a : tells it to allow Equiv annotation. Please do not use it if you do not have a particular reason. -r dir : specifies the directory in which the *.txt and *.a1 files are placed. EOF exit; } use Getopt::Std; getopts("$opt_string", \%opt) or &usage; usage() if $opt{h}; usage() if $#ARGV < 0; if ($opt{r}) {$rDir = $opt{r}} # suffix validness check with the first file my $suffix = ''; if ($ARGV[0] =~ /(\.a2\.t12?3?$|\.a2$)/) {$suffix = $1} else {print STDERR "Invalid suffix: $ARGV[0]\n"; &usage} if (($suffix eq '.a2') && !$opt{a}) {print STDERR "Invalid suffix: $ARGV[0]\n"; &usage} # suffix consistency check my $good = 1; foreach (@ARGV) { if ($_ !~ /$suffix$/) {print STDERR "inconsistent file suffix: $_\n"; $good = 0} } # foreach if (!$good) {exit} # per-text variables referenced globally. Should be initialzed for each file. my ($text, $textlen); my %anno; # annotations my @equiv; # list equivalent groups my $pmid; foreach my $fname (@ARGV) { $fname =~ /([1-9][0-9]*)$suffix$/; $pmid = $1; # initialize per-text variables $text = ''; %anno = @equiv = (); &read_text_file("$rDir/$pmid.txt"); &read_a1_file("$rDir/$pmid.a1"); &read_a2_file($fname); # id-linkage verification after reading all the annotations foreach my $eid (keys %anno) { if ($eid =~ /^E/) { my ($etype, $etid, @arg) = @{$anno{$eid}}; my (@aid, @oid) = (); # argument id, non-protein entity id foreach (@arg) { if (/:/) { my ($tid, $sid) = split ':'; push (@aid, ($tid, $sid)); push (@oid, $sid); } # if else {push @aid, $_} } # foreach if ($etype ne 'Binding') { for (my $i = 1; $i <= $#arg; $i++) { if (($etype !~ /egulation$/) || ($i != 1)) {push @oid, $arg[$i]} } # for } # for if ($etid !~ /^T/) {print STDERR "event expression has to be linked to a text annotation: [$pmid] $eid: $etid?\n"} if ($etype ne ${$anno{$etid}}[0]) {print STDERR "inconsistent event type linkage: [$pmid] $eid: $etid?\n"} # print STDERR join (", ", @aid) , "\n"; foreach my $aid (@aid) { if ($aid eq '') {next} if (!$anno{$aid}) {print STDERR "unknown-id reference: [$pmid] $eid: $aid?\n"} if (($etype !~ /egulation$/) && ($aid !~ /^T/)) {print STDERR "non-regulation type event has a reference to event: [$pmid] $eid: $aid?\n"} } # foreach foreach my $oid (@oid) { if ($oid == 0) {next} if (${$anno{$oid}}[0] ne 'Entity') {print STDERR "arguments that are not theme or cause have to reference to an 'Entity' type entity: [$pmid] $eid: $oid?\n"} } # foreach } # if elsif ($eid =~ /^M/) { my ($mod, $aid, $extra) = @{$anno{$eid}}; if (($aid !~ /^E/) || (!$anno{$eid})) { print STDERR "invalid ID reference: [$pmid] $eid: $aid?\n"; } # if } # elsif } # foreach foreach (@equiv) { my @egroup = @$_; foreach (@egroup) { if (${$anno{$_}}[0] ne 'Protein') {print STDERR "non-protein entity in Equiv relation: $pmid\t[", join (",", @egroup), "]\n"} } # foreach } # foreach # count the number of events referencing each term my %nref = (); foreach my $eid (keys %anno) { if ($_ =~ /^E/) { my @aid = map {${split ':'}[0]} @{${$anno{$eid}}[2]}; foreach (@aid) { if ($nref{$_}) {$nref{$_}++} else {$nref{$_}=1} } # foreach } # if } # foreach # check if only one of each equiv set is referenced foreach (@equiv) { my @egroup = @$_; my $num_term_referenced = 0; foreach (@egroup) { if ($nref{$_}) {$num_term_referenced++} } # foreach if ($num_term_referenced > 1) {print STDERR "multiple terms in a equiv group are referenced: $pmid\t[", join (",", @egroup), "]\n"} } # foreach # check duplication my %seen = (); foreach my $eid (keys %anno) { my $anno = join ' ', @{$anno{$eid}}; if ($seen{$anno}) {print STDERR "duplicated event: [$pmid] $eid = $seen{$anno}\n"} else {$seen{$anno} = $eid} #; if ($eid =~ /^E/) {print STDERR "$eid\t$anno\n"} } # for ($i) } # foreach sub read_text_file { my ($fname) = @_; if (!open (FILE, "<", "$fname")) {print STDERR "cannot open the file: $fname\n"; return} while () {$text .= $_} close (FILE); $textlen = length $text; } # read_text_file sub read_a1_file { my ($fname) = @_; if (!open (FILE, "<", $fname)) {print STDERR "cannot open the file: $fname\n"; return} while () { chomp; if (!/^[TEM][1-9][0-9]*\t[A-Z]/) { print STDERR "please be sure that ID and the remaining annotation are delimited by TAB: [$pmid] $_\n"; next; } my ($id, $anno) = split /\t/; if ($id =~ /^T/) { my ($ttype, $beg, $end, $str) = split / /, $anno; if (($beg !~ /^[0-9]+$/) || ($end !~ /^[0-9]+$/)) {print STDERR "non-numeral offset. please be sure that elements of an annotation are delimited by SPACE: [$pmid] $_\n"} if ($ttype ne 'Protein') {print STDERR "non-protein entity in a1 file: [$pmid] $_\n"} if (!&rangep($beg, $end)) {print STDERR "invalid text range: [$pmid] $_\n"} $anno{$id} = [$ttype, $beg, $end]; } # if else { print STDERR "invalid ID prefix for a1 file: [$pmid] $_\n"; } # else } # while close (FILE); } # read_so_file sub read_a2_file { my ($fname) = @_; if (!open (FILE, "<", $fname)) {print STDERR "cannot open the file: $fname\n"; return} while () { chomp; if (!/^[TEM][1-9][0-9]*\t[A-Z]/ && !/^*\tEquiv/) { print STDERR "please be sure that ID and the remaining annotation are delimited by TAB: [$pmid] $_\n"; next; } # if my ($id, $anno) = split /\t/; if ($id =~ /^T/) { my ($ttype, $beg, $end, $str) = split / /, $anno; if (($beg !~ /^[0-9]+$/) || ($end !~ /^[0-9]+$/)) { print STDERR "non-numeral offset. please be sure that elements of an annotation are delimited by SPACE: [$pmid] $_\n"; next; } #if if ($target_eclass{$ttype} || ($ttype eq 'Entity')) {} else {print STDERR "invalid entity type for a2 file: [$pmid] $_\n"} if (!&rangep($beg, $end)) {print STDERR "invalid text range: [$pmid] $_\n"} if ($anno{$id}) {print STDERR "duplicated entity ID: [$pmid] $_\n"} $anno{$id} = [$ttype, $beg, $end]; } # if elsif ($id =~ /^E/) { my @arg = split ' ', $anno; if ($#arg < 1) { print STDERR "no argument. please be sure that elements of an annotation are delimited by SPACE: [$pmid] $_\n"; next; } # if my @exp = (); my ($type, $tid) = split ':', shift @arg; if (!$target_eclass{$type}) {print STDERR "invalid event type: [$pmid] $_\n"} if ($#arg < 0) {print STDERR "event with no argument: [$pmid] $_\n"} my ($theme, $cause, $site, $csite, $atloc, $toloc) = ('', '', '', '', '', ''); my @theme = (); if ($type eq 'Binding') { my (@rtheme, %themei, %sitei, %site) = (); foreach (@arg) { my ($type, $tid) = split ':'; if ($type =~ /^Theme([2-5]?)/) { push @rtheme, $tid; my $i = ($1)? $1:1; $themei{$i} = $tid; } # if elsif ($type =~ /^Site([2-5]?)/) { my $i = ($1)? $1:1; $sitei{$i} = $tid; } # elsif else { print STDERR "invalid argument: [$pmid] $_\n"; } # elsif } # foreach ## index sites for themes foreach (keys %sitei) {$site{$themei{$_}} = $sitei{$_}} ## sort & uniq @rtheme = sort {$a cmp $b} @rtheme; my $prev = 'nonsuch'; @theme = grep ($_ ne $prev && (($prev) = $_), @rtheme); if ($#theme != $#rtheme) {print STDERR "duplicated themes: [$pmid] $_\n"} my @narg = (); foreach (@theme) { if ($site{$_}) {push @narg, "$_:$site{$_}"} else {push @narg, $_} } # foreach @theme = @narg; } # if else { foreach (@arg) { my ($type, $tid) = split ':'; if ($type eq 'Theme') {$theme = $tid} elsif ($type eq 'Cause') {$cause = $tid} elsif ($type eq 'Site') {$site = $tid} elsif ($type eq 'CSite') {$csite = $tid} elsif ($type eq 'AtLoc') {$atloc = $tid} elsif ($type eq 'ToLoc') {$toloc = $tid} else {print STDERR "invalid argument: [$pmid] $_\n"} } # foreach } # else if ($type eq 'Binding') {if (!@theme) {print STDERR "event with no theme: [$pmid] $_\n"} @exp = ($type, $tid, @theme)} else {if (!$theme) {print STDERR "event with no theme: [$pmid] $_\n"} @exp = ($type, $tid, $theme)} if ($type eq 'Localization') { if ($toloc) {push @exp, ($atloc, $toloc)} elsif ($atloc) {push @exp, ($atloc)} if ($cause || $site || $csite) {print STDERR "invalid argument: [$pmid] $_\n"} } # if elsif ($type eq 'Phosphorylation') { if ($site) {push @exp, $site} if ($cause || $csite || $atloc || $toloc) {print STDERR "invalid argument: [$pmid] $_\n"} } # elsif elsif ($type =~ /egulation$/) { if ($csite) {push @exp, ($cause, $site, $csite)} elsif ($site) {push @exp, ($cause, $site)} elsif ($cause) {push @exp, $cause} if ($atloc || $toloc) {print STDERR "invalid argument: [$pmid] $_\n"} } # elsif else { if ($cause || $site || $csite || $atloc || $toloc) {print STDERR "invalid argument: [$pmid] $_\n"} } # else if (($suffix !~ /a2$/) && ($suffix !~ /t12/) && ($site || $csite || $atloc || $toloc)) {print STDERR "invalid argument for this task: [$pmid] $_\n"} if ($anno{$id}) {print STDERR "duplicated event ID: [$pmid] $_\n"} $anno{$id} = [@exp]; } # if elsif ($id =~ /^M/) { my ($mod, $aid) = split ' ', $anno; if (($mod ne 'Negation') && ($mod ne 'Speculation')) {print STDERR "invalid type of event modification: [$pmid] $_\n"} if (($suffix !~ /a2$/) && ($suffix !~ /3/)) {print STDERR "invalid type of annotation for this task: [$pmid] $_\n"} if ($anno{$id}) {print STDERR "duplicated modifier ID: [$pmid] $_\n"} $anno{$id} = [$mod, $aid]; } # elsif elsif ($id eq '*') { my ($rel, @pid) = split ' ', $anno; if ($rel ne 'Equiv') {print STDERR "invalid type of relation: [$pmid] $_\n"} if ($suffix ne '.a2') {print STDERR "invalid type of annotation for this task: [$pmid] $_\n"} push @equiv, [@pid]; } # elsif else { print STDERR "invalid ID prefix for a2 file: [$pmid] $_\n"; } # else } # while close (FILE); } # read_a2_file sub rangep { my ($beg, $end) = @_; if (((($beg eq '0') || ($beg =~ /^[1-9][0-9]*$/)) && ($end =~ /^[1-9][0-9]*$/)) && (($beg >= 0) && ($end <= $textlen) && ($beg < $end))) {return 1} else {return 0} } # rangep