Gene model curation stats

From DictyWiki

Jump to: navigation, search

Contents


SQL_Queries


Select gene name, gene synonym and protein synonym, DDB_G ID, all genes (without ps)

You can order by gsyn instead of gene.name, or both separated by a comma; this query excludes pseudogenes:

WITH dicty_coding_genes AS (
SELECT dbxref.accession gene_id
FROM cgm_chado.feature gene
JOIN organism ON organism.organism_id=gene.organism_id
JOIN dbxref on dbxref.dbxref_id=gene.dbxref_id
JOIN cgm_chado.cvterm gtype on gtype.cvterm_id=gene.type_id
JOIN cgm_chado.feature_relationship frel ON frel.object_id=gene.feature_id
JOIN cgm_chado.feature mrna ON frel.subject_id=mrna.feature_id
JOIN cgm_chado.cvterm mtype ON mtype.cvterm_id=mrna.type_id
WHERE gtype.name='gene'
AND mtype.name='mRNA'
AND organism.common_name = 'dicty'
AND gene.name NOT LIKE '%\_ps%'
ESCAPE '\'
group by dbxref.accession
)

SELECT gene.name,wm_concat(syn.name) gsyn,dg.gene_id gene_id
FROM cgm_chado.feature gene
JOIN cgm_chado.dbxref on gene.dbxref_id=dbxref.dbxref_id
JOIN dicty_coding_genes dg on dg.gene_id=dbxref.accession
LEFT JOIN cgm_chado.feature_synonym fsyn on gene.feature_id=fsyn.feature_id
LEFT JOIN cgm_chado.synonym_ syn on syn.synonym_id=fsyn.SYNONYM_ID
group by gene.name,dg.gene_id
order by gene.name DESC

To exclude deleted genes:

WITH dicty_coding_genes AS (
SELECT dbxref.accession gene_id
FROM cgm_chado.feature gene
JOIN organism ON organism.organism_id=gene.organism_id
JOIN dbxref on dbxref.dbxref_id=gene.dbxref_id
JOIN cgm_chado.cvterm gtype on gtype.cvterm_id=gene.type_id
JOIN cgm_chado.feature_relationship frel ON frel.object_id=gene.feature_id
JOIN cgm_chado.feature mrna ON frel.subject_id=mrna.feature_id
JOIN cgm_chado.cvterm mtype ON mtype.cvterm_id=mrna.type_id
WHERE gtype.name='gene'
AND mtype.name='mRNA'
AND gene.is_deleted = 0
AND organism.common_name = 'dicty'
AND gene.name NOT LIKE '%\_ps%'
ESCAPE '\'
group by dbxref.accession
)

SELECT gene.name,wm_concat(syn.name) gsyn,dg.gene_id gene_id
FROM cgm_chado.feature gene
JOIN cgm_chado.dbxref on gene.dbxref_id=dbxref.dbxref_id
JOIN dicty_coding_genes dg on dg.gene_id=dbxref.accession
LEFT JOIN cgm_chado.feature_synonym fsyn on gene.feature_id=fsyn.feature_id
LEFT JOIN cgm_chado.synonym_ syn on syn.synonym_id=fsyn.SYNONYM_ID
group by gene.name,dg.gene_id
order by gene.name DESC

To include pseudogenes:

WITH dicty_coding_genes AS (
SELECT dbxref.accession gene_id
FROM cgm_chado.feature gene
JOIN organism ON organism.organism_id=gene.organism_id
JOIN dbxref on dbxref.dbxref_id=gene.dbxref_id
JOIN cgm_chado.cvterm gtype on gtype.cvterm_id=gene.type_id
JOIN cgm_chado.feature_relationship frel ON frel.object_id=gene.feature_id
JOIN cgm_chado.feature mrna ON frel.subject_id=mrna.feature_id
JOIN cgm_chado.cvterm mtype ON mtype.cvterm_id=mrna.type_id
WHERE gtype.name='gene'
AND mtype.name='mRNA'
AND gene.is_deleted = 0
AND organism.common_name = 'dicty'

group by dbxref.accession
)

SELECT gene.name,wm_concat(syn.name) gsyn,dg.gene_id gene_id
FROM cgm_chado.feature gene
JOIN cgm_chado.dbxref on gene.dbxref_id=dbxref.dbxref_id
JOIN dicty_coding_genes dg on dg.gene_id=dbxref.accession
LEFT JOIN cgm_chado.feature_synonym fsyn on gene.feature_id=fsyn.feature_id
LEFT JOIN cgm_chado.synonym_ syn on syn.synonym_id=fsyn.SYNONYM_ID
group by gene.name,dg.gene_id
order by gene.name DESC

Number of curated models

SELECT COUNT(*) as c from cgm_chado.V_VERIFIED_GENE_FEATURES v
  • You can also get table with created_by etc by doing just SELECT * from....
  • Alternative query of gene ID and gene name:
SELECT gene.name cgene_name, gacc.accession cgene_id
   FROM cgm_chado.feature JOIN feature_dbxref fxref ON fxref.feature_id = feature.feature_id 
   JOIN cgm_chado.dbxref ON dbxref.dbxref_id = fxref.dbxref_id 
   JOIN cgm_chado.db ON db.db_id = dbxref.db_id 
   JOIN cgm_chado.cvterm ON cvterm.cvterm_id = feature.TYPE_ID 
   JOIN cgm_chado.feature_relationship frel ON frel.subject_id = feature.feature_id 
   JOIN cgm_chado.feature gene ON gene.feature_id = frel.object_id
   JOIN cgm_chado.dbxref gacc ON gene.dbxref_id=gacc.dbxref_id 
   JOIN cgm_chado.cvterm gtype ON gtype.cvterm_id = gene.TYPE_ID 
   JOIN cgm_chado.feature_relationship frel2 ON frel2.object_id = feature.feature_id 
   JOIN cgm_chado.feature poly ON poly.feature_id = frel2.subject_id 
   JOIN cgm_chado.cvterm ptype ON ptype.cvterm_id = poly.TYPE_ID
   WHERE cvterm.name = 'mRNA'
   AND ptype.name = 'polypeptide'
   AND dbxref.accession = 'dictyBase Curator'
   AND gtype.name = 'gene'
   AND db.name = 'GFF_source'
   AND feature.is_deleted = 0

Gene models not yet curated

Genes that are not TE/RTEs, not pseudogenes, not RNAs, and not skipped.

SELECT *
FROM cgm_chado.v_gene_features g
INNER JOIN cgm_chado.organism o   ON g.organism_id    = o.organism_id
WHERE o.common_name = 'dicty'
AND g.name not like '%_TE'
AND g.name NOT LIKE '%_RTE'
AND EXISTS
(SELECT 'a' FROM cgm_chado.v_mrna_features f1
INNER JOIN cgm_chado.feature_relationship frel1 ON f1.feature_id = frel1.subject_id
INNER JOIN cgm_chado.cvterm ct1                 ON ct1.cvterm_id = frel1.type_id

WHERE frel1.object_id = g.feature_id
AND ct1.name          = 'part_of'
)
AND NOT EXISTS
(SELECT 'a' FROM cgm_chado.v_notdeleted_feature f2
INNER JOIN cgm_chado.feature_relationship frel2 ON f2.feature_id = frel2.subject_id
INNER JOIN cgm_chado.cvterm ct2                 ON ct2.cvterm_id = frel2.type_id
INNER JOIN cgm_chado.cvterm t2                  ON f2.type_id    = t2.cvterm_id
WHERE frel2.object_id = g.feature_id
AND ct2.name          = 'part_of'
AND t2.name LIKE 'pseudogene'
)
AND NOT EXISTS
(SELECT 'a' FROM cgm_chado.v_verified_gene_features f3
INNER JOIN cgm_chado.feature_relationship frel3 ON f3.feature_id = frel3.subject_id
INNER JOIN cgm_chado.cvterm ct3                 ON ct3.cvterm_id = frel3.type_id
INNER JOIN cgm_chado.cvterm t3                  ON f3.type_id    = t3.cvterm_id
WHERE frel3.object_id = g.feature_id
AND ct3.name          = 'part_of'
AND t3.name LIKE 'mRNA'
)
AND NOT EXISTS (
 select * from cgm_chado.featureprop fp
 inner join cvterm ct4 on fp.type_id = ct4.cvterm_id
 where fp.feature_id = g.feature_id
 and ct4.name = 'public note'
 and fp.value like '%inadequate support%'
);


Number of protein coding genes

All: complete, incomplete, skipped, TE/RTEs

SELECT count(*) as c
FROM cgm_chado.v_gene_features g
INNER JOIN cgm_chado.organism o   ON g.organism_id    = o.organism_id
WHERE o.common_name = 'dicty'
AND EXISTS
 (SELECT 'a' FROM cgm_chado.v_primary_feature f1
 INNER JOIN cgm_chado.feature_relationship frel1 ON f1.feature_id = frel1.subject_id
 INNER JOIN cgm_chado.cvterm ct1                 ON ct1.cvterm_id = frel1.type_id
 INNER JOIN cgm_chado.cvterm t1                  ON f1.type_id    = t1.cvterm_id
 WHERE frel1.object_id = g.feature_id
 AND ct1.name          = 'part_of'
 AND t1.name LIKE 'mRNA'
 )
AND NOT EXISTS
 (SELECT 'a' FROM cgm_chado.v_notdeleted_feature f2
 INNER JOIN cgm_chado.feature_relationship frel2 ON f2.feature_id = frel2.subject_id
 INNER JOIN cgm_chado.cvterm ct2                 ON ct2.cvterm_id = frel2.type_id
 INNER JOIN cgm_chado.cvterm t2                  ON f2.type_id    = t2.cvterm_id
 WHERE frel2.object_id = g.feature_id
 AND ct2.name          = 'part_of'
 AND t2.name LIKE 'pseudogene'
 ) ;

Number of pseudogenes

SELECT COUNT(*) as c
      FROM CGM_CHADO.V_NOTDELETED_FEATURE F
INNER JOIN CGM_CHADO.CVTERM CT
        ON CT.CVTERM_ID = F.TYPE_ID
     WHERE CT.NAME = 'pseudogene'

Number of deleted genes with IDs

SELECT dx.accession
     FROM CGM_CHADO.V_DELETED_FEATURE F
INNER JOIN CGM_CHADO.CVTERM CT
     ON CT.CVTERM_ID = F.TYPE_ID
INNER JOIN cgm_chado.dbxref dx
     ON f.dbxref_id = dx.dbxref_id
WHERE ct.name     = 'gene'

To just query the number use:

SELECT COUNT(*) as c
      FROM CGM_CHADO.V_DELETED_FEATURE F
INNER JOIN CGM_CHADO.CVTERM CT
        ON CT.CVTERM_ID = F.TYPE_ID
WHERE ct.name     = 'gene'

Genes with conflicting evidence

This query can also be used to find genes with support notes, such as 'genomic context'.

SELECT v.name,
       p.value 
  FROM cgm_chado.featureprop p inner 
  JOIN cgm_chado.V_VERIFIED_GENE_FEATURES v on p.feature_id=v.feature_id 
 WHERE p.value like 'Conflicting evidence'

Genes with 'complete support'

 
SELECT v.name, p.value
     
  FROM cgm_chado.featureprop p inner 
  JOIN cgm_chado.V_VERIFIED_GENE_FEATURES v on p.feature_id=v.feature_id 
where  p.value like 'ESTs' or p.value like 'sequence similarity'

AND not v.name in
(SELECT v.name
  FROM cgm_chado.featureprop p inner 
  JOIN cgm_chado.V_VERIFIED_GENE_FEATURES v on p.feature_id=v.feature_id 
 WHERE p.value like 'Conflicting evidence' or p.value like 'Incomplete support') 

Genes with 'incomplete supprt'

SELECT count(*) as c from cgm_chado.V_VERIFIED_GENE_FEATURES v
join cgm_chado.featureprop p on p.feature_id=v.feature_id
where p.value like 'Incomplete support'

This works the same for genes with 'genomic context', or get both by adding an 'or' between the p.values.


Skipped genes

Also provides curator note

SELECT d.gene_name , p.value FROM cgm_chado.featureprop p JOIN cgm_chado.v_gene_dictybaseid d on 
d.GENE_FEATURE_ID= p.FEATURE_ID WHERE p.type_id=994 AND value like '%inadequate%'

Modified gene models inferred by curator and date

Replace 'CGM_DDB_BOBD' with CGM_DDB_PFEY or CGM_DDB_PASC or delete the string 'v.created_by ='CGM_DDB_BOBD' and' to get ALL

SELECT v.name , v.timelastmodified
 FROM cgm_chado.V_VERIFIED_GENE_FEATURES v
join cgm_chado.featureprop p on p.feature_id=v.feature_id
WHERE v.created_by ='CGM_DDB_BOBD' and p.value like 'curator inference'
ORDER BY v.timelastmodified

Number of curated models with intron number

Inner query (starting from 'SELECT f.uniquename' to 'GROUP BY f.uniquename') gives DDB ids and number of introns for all curated models

SELECT COUNT(*) as c,
 introns
FROM
 (SELECT f.uniquename,
   COUNT(*)-1 AS introns
 FROM cgm_chado.v_verified_gene_features f
 INNER JOIN cgm_chado.cvterm fcv                ON f.type_id = fcv.cvterm_id
 INNER JOIN cgm_chado.featureloc floc           ON f.feature_id = floc.feature_id
 INNER JOIN cgm_chado.feature_relationship frel ON f.feature_id = frel.object_id
 INNER JOIN cgm_chado.feature e                 ON e.feature_id = frel.subject_id
 INNER JOIN cgm_chado.cvterm ecv                ON e.type_id   = ecv.cvterm_id
 WHERE fcv.name = 'mRNA'
 AND ecv.name   = 'exon'
 GROUP BY f.uniquename
 )
GROUP BY introns

Number of curated models with certain length

SELECT COUNT(*) AS c,
 model_length
FROM
 (SELECT f.uniquename,
   (floc.fmax - floc.fmin) AS model_length
 FROM cgm_chado.v_verified_gene_features f
 INNER JOIN cgm_chado.cvterm fcv      ON f.type_id = fcv.cvterm_id
 INNER JOIN cgm_chado.featureloc floc ON f.feature_id = floc.feature_id
 WHERE fcv.name LIKE 'mRNA'
 )
GROUP BY model_length
ORDER BY model_length

number of proteins (curated) with certain length

SELECT COUNT(*) as c,
 protein_length
FROM
 (SELECT p.uniquename,
  length(p.residues) AS protein_length
  FROM cgm_chado.v_polypeptide_features p
   WHERE EXISTS
     (SELECT 'a'
     FROM cgm_chado.v_verified_gene_features f
     INNER JOIN cgm_chado.feature_relationship r ON f.feature_id = r.object_id
     INNER JOIN cgm_chado.cvterm rcv             ON r.type_id = rcv.cvterm_id
     INNER JOIN cgm_chado.cvterm fcv             ON f.type_id       = fcv.cvterm_id
     WHERE r.subject_id = p.feature_id
     AND rcv.name LIKE 'derived_from'
     AND fcv.name LIKE 'mRNA'
     )
 )
GROUP BY protein_length
ORDER BY protein_length

Number of genes resulting from split

This query does not give the full list. Older genes are not included!!

  • Count
WITH split_replaced AS(
select dbxref.accession acc from cgm_chado.featureprop fprop
join cgm_chado.cvterm on cvterm.cvterm_id=fprop.type_id
join cgm_chado.feature on feature.feature_id=fprop.feature_id
join cgm_chado.cvterm ftype on ftype.cvterm_id=feature.type_id
join cgm_chado.dbxref on feature.dbxref_id=dbxref.dbxref_id
where ftype.name = 'gene'
and cvterm.name = 'replaced by'
and feature.is_deleted = 1
and feature.uniquename like 'DDB_G%'
group by dbxref.accession
having count(to_char(fprop.value)) > 1
)

select count(to_char(fprop.value)) split_id from cgm_chado.feature
join cgm_chado.featureprop fprop on fprop.feature_id=feature.feature_id
join cgm_chado.dbxref on dbxref.dbxref_id=feature.dbxref_id
join cgm_chado.cvterm on cvterm.cvterm_id=fprop.type_id
join split_replaced on split_replaced.acc=dbxref.accession
where cvterm.name = 'replaced by'
and fprop.value like 'DDB_G%'
  • List of gene ids
WITH split_replaced AS(
select dbxref.accession acc from cgm_chado.featureprop fprop
join cgm_chado.cvterm on cvterm.cvterm_id=fprop.type_id
join cgm_chado.feature on feature.feature_id=fprop.feature_id
join cgm_chado.cvterm ftype on ftype.cvterm_id=feature.type_id
join cgm_chado.dbxref on feature.dbxref_id=dbxref.dbxref_id
where ftype.name = 'gene'
and cvterm.name = 'replaced by'
and feature.is_deleted = 1
and feature.uniquename like 'DDB_G%'
group by dbxref.accession
having count(to_char(fprop.value)) > 1
)

select to_char(fprop.value) split_id from cgm_chado.feature
join cgm_chado.featureprop fprop on fprop.feature_id=feature.feature_id
join cgm_chado.dbxref on dbxref.dbxref_id=feature.dbxref_id
join cgm_chado.cvterm on cvterm.cvterm_id=fprop.type_id
join split_replaced on split_replaced.acc=dbxref.accession
where cvterm.name = 'replaced by'
and fprop.value like 'DDB_G%'
  • List of split gene ids (deleted)

select dbxref.accession deleted_id from cgm_chado.featureprop fprop
join cgm_chado.cvterm on cvterm.cvterm_id=fprop.type_id
join cgm_chado.feature on feature.feature_id=fprop.feature_id
join cgm_chado.cvterm ftype on ftype.cvterm_id=feature.type_id
join cgm_chado.dbxref on feature.dbxref_id=dbxref.dbxref_id
where ftype.name = 'gene'
and cvterm.name = 'replaced by'
and feature.is_deleted = 1
and feature.uniquename like 'DDB_G%'
group by dbxref.accession
having count(to_char(fprop.value)) > 1


Number of merged genes

  • Count
WITH replaced_model AS
  (SELECT DISTINCT dbxref.accession gene_id
   FROM cgm_chado.feature gene,
     cgm_chado.cvterm,
     cgm_chado.dbxref,
      (SELECT to_char(SUBSTR(fprop.VALUE,    1,    2000)) gene_id
     FROM cgm_chado.featureprop fprop JOIN cgm_chado.cvterm ON cvterm.cvterm_id = fprop.TYPE_ID 
     JOIN cgm_chado.cv ON cv.cv_id = cvterm.cv_id 
     JOIN cgm_chado.feature ON feature.feature_id = fprop.feature_id
     WHERE cvterm.name = 'replaced by'
     AND cv.name = 'autocreated'
     AND feature.uniquename NOT LIKE 'M%')
  live_gene
   WHERE cvterm.cvterm_id = gene.TYPE_ID
   AND dbxref.dbxref_id = gene.dbxref_id
   AND live_gene.gene_id = dbxref.accession
   AND cvterm.name = 'gene'),
  merged_model AS
  (SELECT replaced_model.gene_id merge_ids
   FROM cgm_chado.feature cfeat JOIN cgm_chado.feature_dbxref fxref ON fxref.feature_id = cfeat.feature_id 
  JOIN cgm_chado.dbxref ON dbxref.dbxref_id = fxref.dbxref_id 
  JOIN cgm_chado.feature_relationship frel ON frel.subject_id = cfeat.feature_id 
  JOIN cgm_chado.feature gene ON frel.object_id = gene.feature_id 
  JOIN cgm_chado.dbxref gxref ON gxref.dbxref_id = gene.dbxref_id 
   JOIN replaced_model ON gxref.accession = replaced_model.gene_id
   WHERE dbxref.accession = 'Sequencing Center'
   GROUP BY replaced_model.gene_id
    HAVING COUNT(cfeat.uniquename) > 1)

SELECT count(*) as c
FROM merged_model

  • List of gene ids
WITH replaced_model AS
  (SELECT DISTINCT dbxref.accession gene_id
   FROM cgm_chado.feature gene,
     cgm_chado.cvterm,
     cgm_chado.dbxref,
      (SELECT to_char(SUBSTR(fprop.VALUE,    1,    2000)) gene_id
     FROM cgm_chado.featureprop fprop JOIN cgm_chado.cvterm ON cvterm.cvterm_id = fprop.TYPE_ID 
     JOIN cgm_chado.cv ON cv.cv_id = cvterm.cv_id 
     JOIN cgm_chado.feature ON feature.feature_id = fprop.feature_id
     WHERE cvterm.name = 'replaced by'
     AND cv.name = 'autocreated'
     AND feature.uniquename NOT LIKE 'M%')
  live_gene
   WHERE cvterm.cvterm_id = gene.TYPE_ID
   AND dbxref.dbxref_id = gene.dbxref_id
   AND live_gene.gene_id = dbxref.accession
   AND cvterm.name = 'gene'),
  merged_model AS
  (SELECT replaced_model.gene_id merge_ids
   FROM cgm_chado.feature cfeat JOIN cgm_chado.feature_dbxref fxref ON fxref.feature_id = cfeat.feature_id 
  JOIN cgm_chado.dbxref ON dbxref.dbxref_id = fxref.dbxref_id 
  JOIN cgm_chado.feature_relationship frel ON frel.subject_id = cfeat.feature_id 
  JOIN cgm_chado.feature gene ON frel.object_id = gene.feature_id 
  JOIN cgm_chado.dbxref gxref ON gxref.dbxref_id = gene.dbxref_id 
   JOIN replaced_model ON gxref.accession = replaced_model.gene_id
   WHERE dbxref.accession = 'Sequencing Center'
   GROUP BY replaced_model.gene_id
    HAVING COUNT(cfeat.uniquename) > 1)

SELECT *
FROM merged_model

Number of genes with coordinate changes

Number of curated genes approved from geneid predictions

Number of gene models changed (from SC prediction)

Delete the COUNT in the last SELECT statement to get a list of genes IDs; delete the ! in the last line to get all gene models that are the same as prediction (equal).

WITH curated AS
  (SELECT gacc.accession cgene_id, 
     poly.md5checksum checksum
   FROM cgm_chado.feature JOIN feature_dbxref fxref ON fxref.feature_id = feature.feature_id 
   JOIN cgm_chado.dbxref ON dbxref.dbxref_id = fxref.dbxref_id 
   JOIN cgm_chado.db ON db.db_id = dbxref.db_id 
   JOIN cgm_chado.cvterm ON cvterm.cvterm_id = feature.TYPE_ID 
   JOIN cgm_chado.feature_relationship frel ON frel.subject_id = feature.feature_id 
   JOIN cgm_chado.feature gene ON gene.feature_id = frel.object_id
   JOIN cgm_chado.dbxref gacc ON gene.dbxref_id=gacc.dbxref_id 
   JOIN cgm_chado.cvterm gtype ON gtype.cvterm_id = gene.TYPE_ID 
   JOIN cgm_chado.feature_relationship frel2 ON frel2.object_id = feature.feature_id 
   JOIN cgm_chado.feature poly ON poly.feature_id = frel2.subject_id 
   JOIN cgm_chado.cvterm ptype ON ptype.cvterm_id = poly.TYPE_ID
   WHERE cvterm.name = 'mRNA'
   AND ptype.name = 'polypeptide'
   AND dbxref.accession = 'dictyBase Curator'
   AND gtype.name = 'gene'
   AND db.name = 'GFF_source'
   AND feature.is_deleted = 0),
  predicted AS
  (SELECT gacc.accession cgene_id,
     poly.md5checksum checksum
   FROM cgm_chado.feature JOIN feature_dbxref fxref ON fxref.feature_id = feature.feature_id 
   JOIN cgm_chado.dbxref ON dbxref.dbxref_id = fxref.dbxref_id 
   JOIN cgm_chado.db ON db.db_id = dbxref.db_id 
   JOIN cgm_chado.cvterm ON cvterm.cvterm_id = feature.TYPE_ID 
   JOIN cgm_chado.feature_relationship frel ON frel.subject_id = feature.feature_id 
   JOIN cgm_chado.feature gene ON gene.feature_id = frel.object_id
   JOIN cgm_chado.dbxref gacc ON gene.dbxref_id=gacc.dbxref_id 
   JOIN cgm_chado.cvterm gtype ON gtype.cvterm_id = gene.TYPE_ID 
   JOIN cgm_chado.feature_relationship frel2 ON frel2.object_id = feature.feature_id 
   JOIN cgm_chado.feature poly ON poly.feature_id = frel2.subject_id 
   JOIN cgm_chado.cvterm ptype ON ptype.cvterm_id = poly.TYPE_ID
   WHERE cvterm.name = 'mRNA'
   AND ptype.name = 'polypeptide'
   AND dbxref.accession = 'Sequencing Center'
   AND gtype.name = 'gene'
   AND db.name = 'GFF_source'
   AND feature.is_deleted = 0)

SELECT COUNT(curated.cgene_id) changed
FROM predicted,
  curated
WHERE predicted.cgene_id = curated.cgene_id
 AND curated.checksum != predicted.checksum

To gets also the gene name, run:

WITH curated AS
 (SELECT gacc.accession cgene_id, gene.name cgene_name,
    poly.md5checksum checksum
  FROM cgm_chado.feature JOIN feature_dbxref fxref ON fxref.feature_id = feature.feature_id 
  JOIN cgm_chado.dbxref ON dbxref.dbxref_id = fxref.dbxref_id 
  JOIN cgm_chado.db ON db.db_id = dbxref.db_id 
  JOIN cgm_chado.cvterm ON cvterm.cvterm_id = feature.TYPE_ID 
  JOIN cgm_chado.feature_relationship frel ON frel.subject_id = feature.feature_id 
  JOIN cgm_chado.feature gene ON gene.feature_id = frel.object_id
  JOIN cgm_chado.dbxref gacc ON gene.dbxref_id=gacc.dbxref_id 
  JOIN cgm_chado.cvterm gtype ON gtype.cvterm_id = gene.TYPE_ID 
  JOIN cgm_chado.feature_relationship frel2 ON frel2.object_id = feature.feature_id 
  JOIN cgm_chado.feature poly ON poly.feature_id = frel2.subject_id 
  JOIN cgm_chado.cvterm ptype ON ptype.cvterm_id = poly.TYPE_ID
  WHERE cvterm.name = 'mRNA'
  AND ptype.name = 'polypeptide'
  AND dbxref.accession = 'dictyBase Curator'
  AND gtype.name = 'gene'
  AND db.name = 'GFF_source'
  AND feature.is_deleted = 0),
 predicted AS
 (SELECT gacc.accession cgene_id,gene.name cgene_name,
    poly.md5checksum checksum
  FROM cgm_chado.feature JOIN feature_dbxref fxref ON fxref.feature_id = feature.feature_id 
  JOIN cgm_chado.dbxref ON dbxref.dbxref_id = fxref.dbxref_id 
  JOIN cgm_chado.db ON db.db_id = dbxref.db_id 
  JOIN cgm_chado.cvterm ON cvterm.cvterm_id = feature.TYPE_ID 
  JOIN cgm_chado.feature_relationship frel ON frel.subject_id = feature.feature_id 
  JOIN cgm_chado.feature gene ON gene.feature_id = frel.object_id
  JOIN cgm_chado.dbxref gacc ON gene.dbxref_id=gacc.dbxref_id 
  JOIN cgm_chado.cvterm gtype ON gtype.cvterm_id = gene.TYPE_ID 
  JOIN cgm_chado.feature_relationship frel2 ON frel2.object_id = feature.feature_id 
  JOIN cgm_chado.feature poly ON poly.feature_id = frel2.subject_id 
  JOIN cgm_chado.cvterm ptype ON ptype.cvterm_id = poly.TYPE_ID
  WHERE cvterm.name = 'mRNA'
  AND ptype.name = 'polypeptide'
  AND dbxref.accession = 'Sequencing Center'
  AND gtype.name = 'gene'
  AND db.name = 'GFF_source'
  AND feature.is_deleted = 0)

SELECT curated.cgene_id, curated.cgene_name
FROM predicted,
 curated
WHERE predicted.cgene_id = curated.cgene_id
AND curated.checksum != predicted.checksum
Personal tools