<?php # $Id: index.php,v 1.2 2007/01/19 19:41:17 gmaster Exp gmaster $
#
$BASE_DIR=$_SERVER["DOCUMENT_ROOT"];
$CODE_DIR=$BASE_DIR.'/lib';
require($BASE_DIR.'/bc.php');
require($BASE_DIR.'/datasets/bc.php');
require($BASE_DIR.'/datasets/egasp2005/bc.php');
require($CODE_DIR.'/Genome.php');
#
# Build one <td> cell for a results plot: a 200px-wide thumbnail
# (<url>.s.png) linking to the high-resolution PNG, followed by
# bracketed [PS][PDF][PNG] download links.
#   $prefix  path prefix of the figure set (e.g. 'figs/HavanaGenes_')
#   $base    figure base name; '' produces a blank placeholder cell
#   $alt     plain-text label used as the thumbnail alt text
#   $title   title passed to hrefs() for the thumbnail link
# Returns the cell markup as a string (nothing is echoed here).
function mkplotcell($prefix,$base,$alt,$title)
{
  if ($base === '') {
    return '<td>&nbsp;</td>';
  }
  $url = $prefix.$base;
  $fig = '<img src="'.$url.'.s.png" alt="'.$alt.'" width="200" />';
  # hrefs() is defined outside this file (presumably in one of the required
  # bc.php files). NOTE(review): meaning of the trailing 1 argument is not
  # visible from here -- confirm against its definition.
  $xtr = '<br />['.hrefs($url.'.ps','PS','PostScript version',1).']'.
               '['.hrefs($url.'.pdf','PDF','PDF version',1).']'.
               '['.hrefs($url.'.png','PNG','PNG high-res version',1).']';
  return '<td align="center">'.hrefs($url.'.png',$fig,$title).$xtr.'</td>';
} # mkplotcell
#
# Echo one table row of (Sn+Sp)/2 plots for a feature.
#   $Rlabel  row heading; may contain <br> tags for line breaks
#   $RbaseS  base name of the sequence-projection plot (figs/HavanaCodingGenes_seqs_*)
#   $RbaseC  base name of the Havana Coding Genes plot (figs/HavanaCodingGenes_*)
#   $RbaseG  base name of the Havana Genes plot        (figs/HavanaGenes_*)
# Passing '' for any base name leaves that column blank.
function mkplotrow($Rlabel,$RbaseS,$RbaseC,$RbaseG) # HavanaCodingGenes_ HavanaGenes_
{
  echo '<tr><th align="right" style="padding-right:15px;"><b>'.$Rlabel.'</b></th>';
  # Flatten <br> tags (and the blanks around them) into single spaces so the
  # label can be used in alt/title attributes. ereg_replace() was removed in
  # PHP 7.0, so the same POSIX-class pattern is applied through PCRE.
  $rlabel = preg_replace('~[[:blank:]]*<br[[:blank:]]*/?>[[:blank:]]*~',' ',$Rlabel);
  echo mkplotcell('figs/HavanaCodingGenes_seqs_',$RbaseS,$rlabel,
                  'Sequence projection of (Sn+Sp)/2 for '.$rlabel.' (annotation set Havana Coding Genes)');
  echo mkplotcell('figs/HavanaCodingGenes_',$RbaseC,$rlabel,
                  '(Sn+Sp)/2 for '.$rlabel.' (annotation set Havana Coding Genes)');
  echo mkplotcell('figs/HavanaGenes_',$RbaseG,$rlabel,
                  '(Sn+Sp)/2 for '.$rlabel.' (annotation set Havana Genes)');
  echo '</tr>'."\n";
} # mkplotrow
#
# Emit the page header. the_header() is defined outside this file;
# NOTE(review): arguments appear to be (title, keywords, description) -- confirm.
# The citation in the title previously had an empty volume field
# ("Genome Biology, , 2006"); it now carries the volume/pages quoted
# in the citation line further down this page.
the_header('DATASETS: "Guigo et al, Genome Biology, 7(Suppl 1):S2, 2006."',
           'EGASP, GASP, genome annotation assessment project, sensitivity, specificity, gene prediction evaluation', # keywords
           'Evaluating EGASP&#39;05 predictions accuracy');
?>

<?php the_title('ANALYSIS of EGASP&#39;05 WORKSHOP PREDICTIONS'); ?>

<div align="center" class="center">
<h1>SUPPLEMENTARY MATERIALS FOR</h1>

<h3>"EGASP: The human ENCODE GENOME ANNOTATION ASSESSMENT PROJECT"</h3>
<a href="<?php echo mailify('mailto:rguigo@imim.es?subject='); ?>EGASP: the ENCODE Genome Annotation Assessment Project (GB, 2006)" title='Contact Roderic Guigo'>R. Guig&oacute;</a><sup><b>&nbsp;+&#64;</b></sup>, P. Flicek<sup><b>&nbsp;+</b></sup>, J. F. Abril<sup><b>&nbsp;+</b></sup>, A. Reymond, J. Lagarde, F. Denoeud, <br /> S. Antonarakis, M. Ashburner, V. B. Bajic, E. Birney, R. Castelo, E. Eyras, <br /> C. Ucla, T. R. Gingeras, J. Harrow, T. Hubbard, S. Lewis and <a href="<?php echo mailify('mailto:rguigo@imim.es?subject='); ?>EGASP: the ENCODE Genome Annotation Assessment Project (GB, 2006)" title='Contact Martin Reese'>M. Reese</a><sup><b>&nbsp;+&#64;</b></sup>. <br /><br class="hh" />
<b><i>Genome Biology</i>, 7(Suppl 1):S2, <!--7 august-->2006.</b> <br class="hh" />
<?php
 # Publication links row: PubMed record, publisher full text, publisher PDF
 # and this dataset page, separated by non-breaking spaces.
 # pmid() and pref() are defined outside this file; presumably each one
 # prints a link (pmid from a PubMed id, pref from url/text/title) -- confirm.
 pmid(16925836,'PubMed'); echo '&nbsp;';
 pref('http://genomebiology.com/2006/7/S1/S2',
      'Full Text', 'View document at Genome Biology web site'); echo '&nbsp;';
 pref('http://genomebiology.com/content/pdf/gb-2006-7-s1-s2.pdf',
      'PDF', 'Get the PDF from Genome Biology'); echo '&nbsp;';
 pref('/datasets/egasp2005/',
      'Datasets', 'This dataset'); ?> <br /><br class="hh" />
<sup><b>+</b></sup> These authors contributed equally to this work. <br class="hh" />
<sup><b>&#64;</b></sup> To whom correspondence should be addressed. <br class="hh" />
<a href="<?php echo mailify('mailto:rguigo@imim.es?subject='); ?>EGASP: the ENCODE Genome Annotation Assessment Project (GB, 2006)" title='Contact Roderic Guigo'>R. Guig&oacute;</a> and 
<a href="<?php echo mailify('mailto:rguigo@imim.es?subject='); ?>EGASP: the ENCODE Genome Annotation Assessment Project (GB, 2006)" title='Contact Martin Reese'>M. Reese</a>. <br class="hh" /><br class="hh" />

<big><span style="font-weight: bold; background: #FF0000; color: #FFFFFF; padding-left: 5px; padding-right: 5px;">NEW</span></big> &nbsp; <a href="http://www.amazon.co.uk/Egasp-05-Annotation-Assessment-Project/dp/0954027825/ref=pd_bbs_sr_1?ie=UTF8&amp;s=books&amp;qid=1195146526&amp;sr=8-1" title="EGASP Genome Biology Special Issue Now Available from Amazon">We are pleased to announce that the EGASP <i>Genome Biology</i> supplement is now available via AMAZON</a>. &nbsp; <big><span style="font-weight: bold; background: #FF0000; color: #FFFFFF; padding-left: 5px; padding-right: 5px;">NEW</span></big>

<br /><br /></div>


<?php hsubsec('Contents','TOC'); ?>

<?php
# Table of contents for this page.
# Each csec()/csubsec()/csubsubsec() entry takes a display title and an id,
# optionally followed by nested child entries. The ids used here are the same
# ones passed to the hsec()/hsubsec()/hsubsubsec() heading calls further down
# the page, so the two lists must be kept in sync when a section is renamed.
# mk_toc() and the c*sec() helpers are defined outside this file;
# NOTE(review): presumably the id is the in-page anchor name -- confirm.
#
# Former top-level entry, disabled (the Summary entry below replaced it):
# csec('Evaluation of the Accuracy of the Predictions','Evaluation_of_the_Accuracy_of_the_Predictions') 
mk_toc(csec('Summary','Summary'),
       csec('Havana Curated Datasets','Havana_Curated_Datasets',
	    csubsec('Reference Annotation','Reference_Annotation'),
	    csubsec('Coding Features Annotation','Coding_Features_Annotation'),
	    csubsec('Annotation Statistics','Annotation_Statistics')
	    ),
       csec('Gene Finding Predictions Sets','Gene_Finding_Predictions_Sets',
	    csubsec('Files Submitted to EGASP&#39;05','Files_Submitted_to_EGASP05'),
	    csubsec('Post-Submissions','Post_Submissions')
	    ),
       csec('Evaluation Methods','Evaluation_Methods',
	    csubsec('Comparisons at Nucleotide Level','Comparisons_at_Nucleotide_Level'),
		    # Disabled IMIM/EBI sub-entries for the nucleotide level
		    # (the matching headings are commented out in the page body):
		    # csubsubsec('IMIM','Nucleotide_Level_IMIM'),
		    # csubsubsec('EBI', 'Nucleotide_Level_EBI' )
		    # ),
	    csubsec('Comparisons at Exon Level','Comparisons_at_Exon_Level'),
		    # Disabled IMIM/EBI sub-entries for the exon level
		    # (the matching headings are commented out in the page body):
		    # csubsubsec('IMIM','Exon_Level_IMIM'),
		    # csubsubsec('EBI', 'Exon_Level_EBI' )
		    # ),
	    csubsec('Comparisons at Gene/Transcript Level','Comparisons_at_Gene_Transcript_Level',
		    csubsubsec('IMIM','Gene_Transcript_IMIM'),
		    csubsubsec('EBI', 'Gene_Transcript_EBI' )
		    )
	    ),
       csec('EGASP&#39;05 Evaluation Results','EGASP_05_Evaluation_Results',
	    csubsec('Nucleotide Level','Results_Nucleotide_Level'),
	    csubsec('Coding Exon Level','Results_CodingExon_Level'),
	    csubsec('Exon Level','Results_Exon_Level'),
            csubsec('Splice Sites','Results_Splice_Sites_Level'),
	    csubsec('Gene/Transcript Level','Results_Gene_Transcript_Level')
	    ),
       csec('EGASP&#39;05 Evaluations from Participants','EGASP_05_Evaluations_from_Participants',
	    csubsec('Mario Stanke','Mario_Stanke')
	    )
       );
?> <br />


<?php hsubsec('Summary','Summary'); ?><!-- ?php hsubsec('Evaluation of the Accuracy of the Predictions','Evaluation_of_the_Accuracy_of_the_Predictions'); ? -->

<p> Different evaluation programs were used to
compare the accuracy of the gene predictions submitted to the
<a href="/gencode/workshop2005.html"
 title="GENCODE EGASP&#39;05 workshop">GENCODE EGASP&#39;05 workshop</a>,
held at the Sanger Center on May 6-7, 2005. The
results from those evaluations are provided here, along with some
discussion on the different methods to calculate the accuracies of
each different approach at three levels of the gene structure
(basically at nucleotide, exon, transcript/gene levels). </p>


<?php hsec('Havana Curated Datasets','Havana_Curated_Datasets'); ?>

<p> Files for each annotation freeze are available at the following ftp repository: <br /></p>

<div class="center"><a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/"
 title="Annotation freeze ftp repository">ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/</a>
</div>

<p> For the EGASP&#39;05 workshop evaluations, the
<a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/version00.2_29apr05/"
 title="April 29th, 2005, annotation data freeze">April 29th, 2005, freeze</a>
(<a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/version00.2_29apr05/"
  title="April 29th, 2005, annotation data freeze">version00.2_29apr05</a>) was used.
All the comparisons were calculated on
<a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/version00.2_29apr05/ENCODE_coord/"
 title="ENCODE regions relative coordinates">ENCODE regions relative coordinates</a>.
The CDS exon predictions were evaluated against the annotations in the
<a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/version00.2_29apr05/ENCODE_coord/genes_with_cds/"
 title="Annotated genes with CDS exons">genes_with_cds</a> subdirectory.
Complete gene predictions (without distinguishing between UTR/CDS exons) were compared
with the annotations from the
<a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/version00.2_29apr05/ENCODE_coord/genes_known_validated/" title="Annotated validated known genes">genes_known_validated</a> subdirectory.
For some of the evaluation procedures, a nucleotide or an exon
projection was required. The same transformations were performed onto
the predicted genic structures and the curated ones. Those
modifications are described in more detail in the
<a href="#Evaluation_Methods" title="Evaluating predictions accuracy">corresponding methods section</a>. </p>
<p> The following description of the annotation datasets was adapted from an email by
<a href="<?php echo mailify('mailto:fdenoeud@imim.es'); ?>" title="Contact France Denoeud"><i>France Denoeud</i></a>. </p>

<?php hsubsec('Reference Annotation','Reference_Annotation'); ?>

<p> The file to consider as the "reference annotation" is <a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/version00.2_29apr05/ENCODE_coord/genes_known_validated/44regions_genes.gtf.gz" title="Download reference annotation in GTF format">44regions_genes.gtf.gz</a>.
It contains HAVANA-GENCODE annotations belonging to the following
categories: </p>

<dl>
 <dt> Known&nbsp; </dt>
     <dd> Known protein coding genes (referenced in Entrez Gene, NCBI). </dd>
 <dt> Novel_CDS&nbsp; </dt>
     <dd> Novel protein coding genes annotated by Havana (not referenced in Entrez Gene, NCBI).</dd>
 <dt> Novel_transcript_gencode_conf&nbsp; </dt>
     <dd> Novel transcripts annotated by Havana (no ORF assigned) with at least one junction validated by RT-PCR. </dd>
 <dt> Putative_gencode_conf&nbsp; </dt>
     <dd> Putative transcripts (similar to "novel transcripts", EST supported, short, no viable ORF) with at least one junction validated by RT-PCR. It also contains exons pairs from predictions that have been validated by RT-PCR. </dd>
</dl>

<p> This annotation is the most complete annotation of the ENCODE regions.
It includes REFSEQ and ENSEMBL but contains much more, especially in
terms of alternative splicing. We refer to this set as the "Genes" set. </p>

<?php hsubsec('Coding Features Annotation','Coding_Features_Annotation'); ?>

<p> For the genes described above, some coding and non coding transcripts were annotated.
The non coding transcripts are more dubious than the coding ones, as no
ORF could be determined without ambiguity (and some of the non coding transcripts can be partial).
If we need to be more conservative, we might want to use the file
<a href="ftp://genome.imim.es/pub/projects/gencode/data/havana-encode/version00.2_29apr05/ENCODE_coord/genes_with_cds/44regions_coding.gff.gz" title="Download reference annotation in GTF format">44regions_coding.gff.gz</a>.
It contains only the coding transcripts from
HAVANA-GENCODE <tt>Known</tt> and <tt>Novel_CDS</tt> categories (with coding and non
coding exons, the other categories from the previous section are non coding). We refer to this set as the "Coding Genes" set. </p>

<?php hsubsec('Annotation Statistics','Annotation_Statistics'); ?>

<p> In the "Coding Genes" set there are 439 genes, 2431
transcripts, 17520 exons and 9530 coding exons. In the "Genes" set
there are 551 genes, 2603 transcripts, 18074 exons and 9530 coding
exons (as already said in the previous section, set "Genes" contains
set "Coding Genes"). It can be seen from those figures that the number
of coding exons was the same (and they are also the same at the
coordinates level). Furthermore, there were only around 7% more
untranslated exons. </p>


<?php hsec('Gene Finding Predictions Sets','Gene_Finding_Predictions_Sets'); ?>

<p> Gene predictions were submitted under one of the seven categories described in the <a href="/gencode/workshop2005.html" title="EGASP&#39;05 workshop page">EGASP&#39;05 workshop page</a>.
A table with the number of GTF features predicted on each ENCODE sequence can be downloaded from the following links: <a href="ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050503/submissions_bysubmitter.pdf" title="Summary of submitted GTF features ordered by submitter">ordered by submitter code</a> or <a href="ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050503/submissions_bymethod.pdf" title="Summary of submitted GTF features ordered by category">ordered by prediction category</a>. </p>

<?php hsubsec('Files Submitted to EGASP&#39;05','Files_Submitted_to_EGASP05'); ?>

<p> All the submissions are available from the following link: </p>
<div class="center"> <a href="ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050503/" title="EGASP submitted sets, version 20050503">ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050503/</a> </div>
<p> They were reformatted and renamed to make them more suitable for
uploading into the ENCODE specific UCSC Genome Browser. In some cases
the predictions on the test sequence set were requested from the
corresponding groups in order to show all the predictions on the
complete set of 44 ENCODE regions. The new files can be found at: </p>
<div class="center"> <a href="ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050621/" title="EGASP submitted sets, updated version 20050621">ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050621/</a> </div> <br />

<?php hsubsec('Post-Submissions','Post_Submissions'); ?>

<p> There are four files under this category. They basically correspond
to fixes on a previously submitted prediction set. For instance,
initial <tt>GeneMark</tt> predictions were obtained on the unmasked
sequences for the ENCODE regions. In this case, the post-submission
contains the results for <tt>GeneMark</tt> on the properly masked sequences. On the other hand, <tt>Fgenesh</tt>
predicted genes with Refseq support in reverse strand were shifted by 1
when they were transferred to GTF format. This was fixed on the latest
Softberry submitted predictions. </p>
<p> The post-submissions are available from the following link: </p>
<div class="center"> <a href="ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050503/post_predictions/" title="EGASP post-submissions, version 20050503">ftp://genome.imim.es/pub/projects/gencode/data/egasp05/egasp_submissions_20050503/post_predictions/</a> </div> <br />


<?php hsec('Evaluation Methods','Evaluation_Methods'); ?>

<table width="100%" border="0">
<tr>
<td class="center" width="320px">
<a href="figs/Accuracy_measures_setannot.png" title="Click here to get the original figure."><img src="figs/Accuracy_measures_setannot.png" alt="Click here to get the original figure." width="300px" height="125px" align="middle" border="0" /></a> <br />
<a href="figs/Accuracy_measures_setpred.png" title="Click here to get the original figure."><img src="figs/Accuracy_measures_setpred.png" alt="Click here to get the original figure." width="300px" height="104px" align="middle" border="0" /></a> <br />
</td>
<td width="380px"> 
<p> The two figures on the left sketch the exonic structure of a set of annotated
and predicted transcripts for a given locus.
You can obtain <a href="/gencode/wiki/images/f/f1/Accuracy_measures.pdf"
title="Schema of accuracy measures at nucleotide and exon levels">from here a PDF file</a>
summarizing all the figures for the nucleotide and exon level comparisons.
The accuracy measures being used along this section are described in
Burset and Guig&oacute; <?php pmid(8786136,'<i>Genomics</i>, 34/3:353-357, 1996'); ?>,
Reese <i>et al</i> <?php pmid(10779488,'<i>Genome Research</i>, 10/4:483-501, 2000'); ?>
 and Guig&oacute; <i>et al</i> <?php pmid(11042160,'<i>Genome Research</i>, 10/10:1631-1642, 2000'); ?>. </p>
</td> 
</tr> 
</table> 

<?php hsubsec('Comparisons at Nucleotide Level','Comparisons_at_Nucleotide_Level'); ?>

<table width="100%" border="0">
<tr>
<td class="center" width="320px">
<a href="figs/Accuracy_measures_evalnucl.png" title="Click here to get the original figure."><img src="figs/Accuracy_measures_evalnucl.png" alt="Click here to get the original figure." width="300px" height="139px" align="middle" border="0" /></a> <br />
</td>
<td width="380px"> 
<p> The genic structures shown in the previous section were projected
onto a set of non-overlapping annotated/predicted nucleotide regions. Those regions
were then compared as if they were single exons in order to calculate
the corresponding <i>Sn</i>, <i>Sp</i> and <i>CC</i> measures. </p>
</td> 
</tr> 
</table> 

<!-- php hsubsubsec('IMIM','Nucleotide_Level_IMIM'); -->
<!-- php hsubsubsec('EBI','Nucleotide_Level_EBI'); -->

<?php hsubsec('Comparisons at Exon Level','Comparisons_at_Exon_Level'); ?>

<table width="100%" border="0">
<tr>
<td class="center" width="320px">
<a href="figs/Accuracy_measures_evalexon.png" title="Click here to get the original figure."><img src="figs/Accuracy_measures_evalexon.png" alt="Click here to get the original figure." width="300px" height="167px" align="middle" border="0" /></a> <br />
</td>
<td width="380px"> 
<p> For each program all
of the unique exons are determined and compared to
the unique exons in the annotation. Sensitivity is defined as the
fraction of the unique annotation exons that are predicted exactly by
each program. Specificity is the fraction of each program&#39;s unique
exons that are correct (i.e. match an annotation exon). Missed exons
are the number of unique coding exons in the annotation that are not
predicted exactly. Wrong exons are the total number of unique exons in
the prediction that do not match any coding exon in the annotation. </p>
</td> 
</tr> 
</table> 

<!-- php hsubsubsec('IMIM','Exon_Level_IMIM'); -->
<!-- php hsubsubsec('EBI','Exon_Level_EBI'); -->

<p> <b>Exon Level Predictions:</b> 
There are 4387 unique exons in the annotation and
a total of 9180 coding exons in all annotated transcripts. </p>

<p> <b>Corrections:</b> </p>
<ul>
<li> The fgenesh++ predictions were resubmitted
after the meeting to correct an off-by-one error in the creation of the
gtf. The corrected predictions are included above. The original
submission had exon sensitivity: 0.58537; exon specificity:
0.55309; missed exons: 1819; wrong exons: 2075; total exons: 5693;
unique exons: 4643. </li>
<li> The Exogean predictions presented at the meeting were not correct
because of an error in the evaluation program. The correct statistics
are posted here. </li>
<li> The GeneMark-HMM predictions were inadvertently run on unmasked
sequence. This leads to a large number of false positive predictions.
Corrected predictions were submitted after the meeting, but the original
predictions are reported here. </li>
</ul> <br />

<?php hsubsec('Comparisons at Gene/Transcript Level','Comparisons_at_Gene_Transcript_Level'); ?>

&nbsp; <br />
<table width="100%" border="0">
<tr>
<td class="center" width="450px">
<a href="figs/Accuracy_measures_gene_level.png" title="Click here to get the original figure."><img src="figs/Accuracy_measures_gene_level.s.png" alt="Click here to get the original figure." width="430px" height="125px" align="middle" border="0" /></a> <br />
</td>
</tr> 
</table> 

<?php hsubsubsec('IMIM','Gene_Transcript_IMIM'); ?>

<p> To evaluate the accuracy of alternative splicing prediction, an evaluation perl script developed by
<a href="<?php echo mailify('mailto:eeyras@imim.es?subject=evaluation.pl'); ?>"
 title="Contact Eduardo Eyras"><i>Eduardo Eyras</i></a>
was used. He provided the following description of how different
accuracy parameters are calculated by this tool. You can get in contact
with him if you are interested in obtaining a copy of this evaluation
tool. </p>

<dl>
 <dt> Gene Level </dt>
  <dd> A gene is taken as a cluster of transcripts (according to
exon-overlap) in the same strand. When we compare genes, we in fact
compare clusters of transcripts. <br />
At gene level we compare all the nucleotides in the prediction and
in the annotation. We perform a projection of each set to the genome
(to eliminate the redundant base-pairs), and compare the projections
of the predictions and annotations. <br />
We also compare the exons. Similarly to the nucleotide comparison, we
extract the set of unique exons in the annotation and prediction sets
(i.e. eliminate redundancy). We then compare the set of exons and
label as found each exon that has been correctly predicted with both
splice-sites correct. <br />
Likewise, we compare the set of unique introns in each gene
prediction. We project all annotated and predicted introns into two
sets of non-redundant introns (annotated and predicted), and compare
these two sets regardless of the transcript to which they are
associated. For this comparison, only the actual boundaries of the
intron (donor and acceptor sites) are used. This measure can give an
estimate of the accuracy of the splice-site prediction. <br />
These three measures, nucleotide, exon and intron level, give an
overall measure of the accuracy, regardless of the actual transcript
structures. For each of them we calculate the sensitivity (<i>Sn</i>), the
specificity (<i>Sp</i>), the wrong cases (<i>W</i>), as the fraction of predictions
that do not overlap any annotation, and the missing cases (<i>M</i>), as the
fraction of annotations that do not overlap any prediction. <br />
For every comparison of genes, we establish a one-to-one mapping of
transcripts in the following way. For every possible pair of
transcripts (one from the annotation and one from the prediction), we
calculate the correlation coefficient at the nucleotide level (<i>CCn</i>),
where <i>TN</i> is calculated using the extension in the genome of these two
transcripts only. The best possible pairs can be taken as a loose
measure of accuracy at the transcript level. At this stage we also
calculate the more strict measure related to the annotated transcripts
which are exactly found by the annotation. <br /><br /> </dd>
 <dt> Transcript level </dt>
  <dd> Once this one-to-one mapping of transcripts has been established we
can produce measures at the "transcript level", which gives a better
view of the accuracy taking into account the connectivity of exons
into splicing forms. <br />
We calculate the same measures as above (<i>SN, SP, W, M</i>) for
nucleotides, exons and introns, but this time, only using the
transcript-pairs obtained by the method described above. These
measures give a better estimate of the accuracy of the predicted
exon-intron structures, whereas the gene-based comparisons described
above provide an overall performance per gene locus. <br />
All these measures are calculated per gene locus. Gene loci in the
annotation and the prediction are grouped according to overlap, and
the measures calculated for each group. Currently there is no measure
of the split and joining of genes or transcripts. <br />
A summary result is also obtained with the averaged measures.  Besides
these measures we also calculate the average exact transcripts found
per gene-pair, as an estimator of how well multiple transcripts are
found by the prediction. <br /> </dd>
</dl>

<?php hsubsubsec('EBI','Gene_Transcript_EBI'); ?>

<p> Evaluations at the Transcript and the Gene level are meant to test
the ability of the gene-prediction algorithms to correctly connect the
predicted exons into transcription structures. Genes in the Havana
annotation have an average of 2.53 (CHECK NUMBER) transcripts per gene.
Most prediction programs are only able to predict one transcript per
gene. </p>

<dl>
 <dt> Transcript Level </dt>
  <dd> Transcripts are defined by the
prediction algorithm as the connection between the coding exons and
generally run from a start codon to stop codon. Partial transcript
predictions, such as those that might arise at the end of the ENCODE
region, are also evaluated. <br />
A transcript is judged to be correct if the start and stop codon
locations are correctly predicted and the boundary of every coding exon
is correct. Non-coding exons are not considered. <br /><br /> </dd>
 <dt> Gene level </dt>
  <dd> A gene is judged to be correct if at least
one of its transcripts is predicted correctly (as defined above). The
following figures detail the relationship between gene and transcript
level statistics for several example predictions. <br /> </dd>
</dl>

<div align="center">
<table width="310px" border="0">
<tr>
<td class="center" width="310px">
<a href="figs/EBI-Example1.png" title="Click here to get the original figure."><img src="figs/EBI-Example1.png" alt="Click here to get the original figure." width="300px" height="121px" align="middle" border="0" /></a> <br /> <br />
</td>
</tr><tr>
<td class="center" width="310px">
<a href="figs/EBI-Example2.png" title="Click here to get the original figure."><img src="figs/EBI-Example2.png" alt="Click here to get the original figure." width="300px" height="128px" align="middle" border="0" /></a> <br /> <br />
</td>
</tr><tr>
<td class="center" width="310px">
<a href="figs/EBI-Example3.png" title="Click here to get the original figure."><img src="figs/EBI-Example3.png" alt="Click here to get the original figure." width="300px" height="142px" align="middle" border="0" /></a> <br /> <br />
</td>
</tr> 
</table> 
</div>


<?php hsec('EGASP&#39;05 Evaluation Results','EGASP_05_Evaluation_Results'); ?>

<p> Evaluations were obtained for each of the 44 ENCODE regions.
Furthermore, annotations and predictions for those regions were grouped
to evaluate the overall accuracy of the gene-finding tools in the
following sets: </p>

<ul>
 <li> <b>EN_TRN13</b>.- The 13 regions that were used for training all
the gene-finding software, also known as the <b>training set</b>. </li>
 <li> <b>EN_PRD31</b>.- The 31 regions on which the predictions for
the GENCODE workshop were made, also known as the <b>test set</b>. </li>
 <li> <b>EN_ALL44</b>.- A "summary" for all the ENCODE regions (in
few cases incomplete because the submitters only provided the
predictions for the test set). </li>
 <li> <b>EN_MNL14</b>.- The set of all manual sequence picks containing genes of interest. </li>
 <li> <b>EN_RND30</b>.- The set of all randomly chosen ENCODE sequences. </li>
 <li> <b>EN_MNLp12</b>.- Manual sequence picks from the predictions set. </li>
 <li> <b>EN_RNDp19</b>.- Randomly chosen sequences from the predictions set. </li>
</ul>

<p> One of the advantages of grouping the annotations/predictions in the
way shown above, is that the final evaluation takes into account those
regions in which the programs predicted genes but there were no
annotated genes and vice versa. Regions <b>ENr112</b>, <b>ENr311</b> and <b>ENr313</b>,
are a good example of sequences without manually-curated annotated
genes for which some gene-finders produced predictions. Moreover, the
multisequence sets are also useful to detect possible biases on the
accuracy of the predictions due to the samples being used. </p>

<p> Annotation for the following 13 regions was released before the
workshop (i.e. these regions are considered the training set): ENm004,
ENm006, ENr111, ENr114, ENr132, ENr222, ENr223, ENr231, ENr232, ENr323,
ENr324, ENr333, ENr334. As noted above, these regions are not
necessarily reflective of the entire set of ENCODE sequences. </p>

<p> The following files contain the complete set of measures computed
at different levels, for each submission and each sequence, in plain
ASCII tabular format. The column names from the first row of those
tables are also described here.</p>

<div align="center"><a name="tableresults"></a>
<table style="padding: 5px 10px; border: #000000 dotted 1px;">
<tr><th align="center">ANALYSIS SET</th><td>&nbsp;</td>
    <th align="center">SCORES</th><td>&nbsp;</td>
    <th align="center">DESCRIPTION</th></tr>
<tr><th align="right" style="border-top: #c1c1c1 solid 1px; padding-top: 5px">EBI on Havana Coding Genes Set</th><td width="8px" rowspan="3">&nbsp;</td>
    <td align="center" style="border-top: #c1c1c1 solid 1px;"><a href="tables/paul_summariesbyseq.tbl">TBL</a></td><td width="8px" rowspan="3">&nbsp;</td>
    <td align="center" style="border-top: #c1c1c1 solid 1px;"><a href="tables/desc_paul_summariesbyseq.txt">TXT</a></td></tr>
<tr><th align="right" style="border-top: #c1c1c1 solid 1px; padding-top: 5px">IMIM on Havana Coding Genes Set</th>
    <td align="center" style="border-top: #c1c1c1 solid 1px;"><a href="tables/full_summary.codingexons+exons.progs-x-seqs.HAVANA_CODING_GENES.tbl">TBL</a></td>
    <td align="center" rowspan="2" style="border-top: #c1c1c1 solid 1px;"><a href="tables/desc_full_summary.codingexons+exons.progs-x-seqs.txt">TXT</a></td></tr>
<tr><th align="right">IMIM on Havana Genes Set</th>
    <td align="center"><a href="tables/full_summary.codingexons+exons.progs-x-seqs.HAVANA_GENES.tbl">TBL</a></td></tr>
</table>
</div>

<p> <i>(Sn+Sp)/2</i> figures appearing on the paper were produced from
the EBI data. The following sections provide the same analysis on the
two datasets by IMIM, including analysis on more exonic features, such
as splice sites. One column has been computed for the "Coding Genes"
Havana set and the other column was derived from the "Genes" Havana
set. For completeness' sake, we include here the "Genes"
evaluation. When comparing all the exons (coding and non-coding), any
difference in the evaluation results can be due to the facts that the
"Genes" set contains the "Coding Genes" set and that it includes 112
genes more (along with their corresponding 172 transcripts). Those
genes are made of non-coding exons, thus major differences can be
expected when comparing all predicted exons against all those
annotated exons. However, the figures below show that the evaluation
results are quite similar for the two annotation sets. </p>

<?php hsubsec('Nucleotide Level','Results_Nucleotide_Level'); ?>

<div align="center">
<table style="padding: 10px 15px; border: 0px;">
<tr><th align="center"><b>Feature</b></th><th align="center" colspan="2"><b>Havana Coding Genes Dataset</b></th><th align="center"><b>Havana Genes Dataset</b></th></tr>
<?php # Nucleotide-level plot rows. mkplotrow() arguments are: row label, then the
      # figure base names for the sequence-projection, Coding Genes and Genes columns.
      mkplotrow('CDS', 'Nucleotide_CDS_NCSN-NCSP', 'Nucleotide_CDS_NCSN-NCSP', 'Nucleotide_CDS_NCSN-NCSP'); 
      mkplotrow('mRNA','Nucleotide_mRNA_NESN-NESP','Nucleotide_mRNA_NESN-NESP','Nucleotide_mRNA_NESN-NESP'); ?>
</table>
</div>

<?php hsubsec('Coding Exon Level','Results_CodingExon_Level'); ?>

<p> It has been already stated and it can be observed in the previous
section; at the coding features level, there are no differences between
the evaluation results from comparing predictions against the "Coding
Genes" or the "Genes" Havana datasets. It has been explained in the
<a href="#Havana_Curated_Datasets">Havana Curated Datasets</a> section,
that the "Coding Genes" dataset is a subset of the "Genes"
one. Therefore, here you can only find the results for the first
dataset. </p>

<div align="center">
<table style="padding: 10px 15px; border: 0px;">
<tr><th align="center"><b>Feature</b></th><th align="center" colspan="2"><b>Havana Coding Genes Dataset</b></th><th align="center">&nbsp;</th></tr>
<?php # Coding-exon features are only plotted against the "Havana Coding Genes" set:
      # the fourth argument (the "Havana Genes" figure basename) is left empty, so
      # that column presumably renders as a blank cell, as mkplotrow() does for the
      # other columns when given '' -- TODO confirm against the full function body.
      mkplotrow('All Coding Exons','Coding_Exon_ECSNCDS-ECSPCDS','Coding_Exon_ECSNCDS-ECSPCDS', ''); 
      mkplotrow('Introns within<br />Coding Exons','Intron_CDS_ECSNcIN-ECSPcIN','Intron_CDS_ECSNcIN-ECSPcIN', ''); 
      mkplotrow('First Exon','Coding_Exons:_First_ECSNfEx-ECSPfEx','Coding_Exons:_First_ECSNfEx-ECSPfEx',''); 
      mkplotrow('Internal Exon','Coding_Exons:_Internal_ECSNiEx-ECSPiEx','Coding_Exons:_Internal_ECSNiEx-ECSPiEx',''); 
      mkplotrow('Terminal Exon','Coding_Exons:_Terminal_ECSNtEx-ECSPtEx','Coding_Exons:_Terminal_ECSNtEx-ECSPtEx',''); 
      mkplotrow('Single Exon','Coding_Exons:_Single_ECSNsEx-ECSPsEx','Coding_Exons:_Single_ECSNsEx-ECSPsEx','');
      mkplotrow('Start Codon','Start_Codon_ECSNatg-ECSPatg','Start_Codon_ECSNatg-ECSPatg',''); 
      mkplotrow('Stop Codon','Stop_Codon_ECSNstp-ECSPstp','Stop_Codon_ECSNstp-ECSPstp',''); ?>
</table>
</div>

<p> Sample sizes are smaller for the "Single" exons case. This is
reflected in the shape of the boxplots. On the other hand, from the above figures
one can see that the "Internal" exons class has better results
than the "First" or "Terminal" exons classes. </p>

<?php hsubsec('Exon Level','Results_Exon_Level'); ?>

<div align="center">
<table style="padding: 10px 15px; border: 0px;">
<tr><th align="center"><b>Feature</b></th><th align="center" colspan="2"><b>Havana Coding Genes Dataset</b></th><th align="center"><b>Havana Genes Dataset</b></th></tr>
<?php # Exon-level rows. mkplotrow() arguments: row label, then the figure basenames
      # for the sequence-projection plot, the "Havana Coding Genes" plot and the
      # "Havana Genes" plot.
      mkplotrow('All Exons','Exon_ECSNeEx-ECSPeEx','Exon_ECSNeEx-ECSPeEx','Exon_ECSNeEx-ECSPeEx'); 
      mkplotrow('All Introns','Intron_mRNA_ECSNeIN-ECSPeIN','Intron_mRNA_ECSNeIN-ECSPeIN','Intron_mRNA_ECSNeIN-ECSPeIN'); 
      mkplotrow('Transcription<br />Start Site (TSS)','Transcription_Start_Site_ECSNtss-ECSPtss','Transcription_Start_Site_ECSNtss-ECSPtss','Transcription_Start_Site_ECSNtss-ECSPtss'); 
      # The End Site label previously read "(TSS)", copied from the row above; the
      # abbreviation is TES. The basenames keep their "Trascription" spelling since
      # they presumably match the generated figure files -- verify before renaming.
      mkplotrow('Transcription<br />End Site (TES)','Trascription_End_Site_ECSNtes-ECSPtes','Trascription_End_Site_ECSNtes-ECSPtes','Trascription_End_Site_ECSNtes-ECSPtes'); ?>
</table>
</div>

<?php hsubsec('Splice Sites','Results_Splice_Sites_Level'); ?>

<div align="center">
<table style="padding: 10px 15px; border: 0px;">
<tr><th align="center"><b>Feature</b></th><th align="center" colspan="2"><b>Havana Coding Genes Dataset</b></th><th align="center"><b>Havana Genes Dataset</b></th></tr>
<?php # Splice-site rows. The "(Coding Exons)" rows pass an empty "Havana Genes"
      # basename (fourth argument), so only the "(All Exons)" rows carry a figure
      # for that annotation set.
      mkplotrow('Donor Site<br />(Coding Exons)','Donor_Coding_Exons_ECSNcDn-ECSPcDn','Donor_Coding_Exons_ECSNcDn-ECSPcDn',''); 
      mkplotrow('Donor Site<br />(All Exons)','Donor_All_Exons_ECSNeDn-ECSPeDn','Donor_All_Exons_ECSNeDn-ECSPeDn','Donor_All_Exons_ECSNeDn-ECSPeDn'); 
      mkplotrow('Acceptor Site<br />(Coding Exons)','Acceptor_Coding_Exons_ECSNcAc-ECSPcAc','Acceptor_Coding_Exons_ECSNcAc-ECSPcAc',''); 
      mkplotrow('Acceptor Site<br />(All Exons)','Acceptor_All_Exons_ECSNeAc-ECSPeAc','Acceptor_All_Exons_ECSNeAc-ECSPeAc','Acceptor_All_Exons_ECSNeAc-ECSPeAc'); ?>
</table>
</div>

<?php hsubsec('Gene/Transcript Level','Results_Gene_Transcript_Level'); ?>

<div align="center">
<table style="padding: 10px 15px; border: 0px;">
<tr><th align="center"><b>Feature</b></th><th align="center" colspan="2"><b>Havana Coding Genes Dataset</b></th><th align="center">&nbsp;</th></tr>
<?php # Gene/transcript rows, plotted for the "Havana Coding Genes" set only (empty
      # fourth argument).
      # NOTE(review): unlike the other tables, the sequence-projection basename
      # (second argument) differs from the Coding Genes basename (third argument)
      # by an extra "Coding_Exons_" -- confirm both match the files under figs/.
      mkplotrow('Transcript','Transcript_Coding_Exons_GCSNt-GCSPt','Transcript_GCSNt-GCSPt',''); 
      mkplotrow('Gene','Gene_Coding_Exons_GCSNg-GCSPg','Gene_GCSNg-GCSPg',''); ?>
</table>
</div>

<p> We were unable to properly test all the changes made to Eduardo's original evaluation Perl script before submission. The boxplots in this section already show that there is a problem when grouping features at transcript level (i.e., for <tt>Ensembl</tt> and <tt>GeneMark.hmm</tt> columns) and that some bug leads to wrong sensitivity and specificity values at gene level (above 1 in some cases). Therefore, we rely on the results at gene and transcript levels shown in the paper, which were produced from the EBI evaluation summary file (see the <a href="#tableresults">table listing the corresponding evaluation files</a>). Furthermore, evaluations for six sequence sets, those for gene-density and mouse-homology pseudo-sequences, were not performed. This explains why the six rightmost columns in the above sequence boxplots are empty. </p>

<?php hsec('EGASP&#39;05 Evaluations from Participants','EGASP_05_Evaluations_from_Participants'); ?>

<p> </p>

<?php hsubsec('Mario Stanke','Mario_Stanke'); ?>

<p> This section summarizes the evaluation results on geneid, augustus,
genscan, genemark and genezilla on the 31 ENCODE test regions.
From each prediction and the annotation, only the CDS lines were kept
for which both the end and begin coordinates were larger than 0 and at
most the length of the sequence. Thus, every exon (partially)
outside of the region was discarded. Then, stop codons were added to
all predictions, as the CDS in the annotation contained the stop
codons. The evaluations were produced using the program
<a href="soft/" title="Download Keibler's evaluation software"><tt>eval</tt>
by Evan Keibler</a> <?php pmid(14565849, 'Keibler and Brent, <i>BMC Bioinformatics</i> 4:50. 2003'); ?>. </p>

<ul style="margin-left: 200px;">
 <li> <a href="tables/TestEncode.GeneMark.abinitio.pdf" title="TestEncode.GeneMark.abinitio.pdf"> TestEncode.GeneMark.abinitio.pdf</a> </li> 
 <li> <a href="tables/TestEncode.GeneZilla.abinitio.pdf" title="TestEncode.GeneZilla.abinitio.pdf"> TestEncode.GeneZilla.abinitio.pdf</a> </li> 
 <li> <a href="tables/TestEncode.augustus.abinitio.pdf" title="TestEncode.augustus.abinitio.pdf"> TestEncode.augustus.abinitio.pdf</a> </li> 
 <li> <a href="tables/TestEncode.geneid.abinitio.pdf" title="TestEncode.geneid.abinitio.pdf"> TestEncode.geneid.abinitio.pdf</a> </li> 
 <li> <a href="tables/TestEncode.genscan.abinitio.pdf" title="TestEncode.genscan.abinitio.pdf"> TestEncode.genscan.abinitio.pdf</a> </li> 
</ul>


<?php page_trailer(__FILE__); ?>
