SQL Queries

From DictyWiki

(Difference between revisions)
Jump to: navigation, search
Revision as of 01:10, 24 November 2020 (edit)
Pfey@northwestern.edu (Talk | contribs)

← Previous diff
Revision as of 16:11, 22 April 2021 (edit)
Pfey@northwestern.edu (Talk | contribs)
(Strain Descriptor, DBS ID, Strain description with specific strain characteristic)
Next diff →
Line 562: Line 562:
<pre> <pre>
-select sc.strain_name, dx.accession, sc.strain_description +select strain_name, dx.accession, mutagenesis_method, strain_description
-from cgm_ddb.stock_center sc +from cgm_ddb.stock_center sc
inner join cgm_chado.dbxref dx on dx.dbxref_id = sc.dbxref_id inner join cgm_chado.dbxref dx on dx.dbxref_id = sc.dbxref_id
inner join cgm_ddb.strain_char_cvterm scc ON sc.id=scc.strain_id inner join cgm_ddb.strain_char_cvterm scc ON sc.id=scc.strain_id
inner join cgm_chado.cvterm ON cvterm.cvterm_id= scc.cvterm_id inner join cgm_chado.cvterm ON cvterm.cvterm_id= scc.cvterm_id
-where cvterm.name = 'null mutant'+and cvterm.name = 'CFP marked'
</pre> </pre>

Revision as of 16:11, 22 April 2021

Contents


Gene model curation stats


Total GO

Canonical query (excluding obsolete annotations)
SELECT COUNT(fcvt.feature_cvterm_id) annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN cgm_chado.organism ON organism.organism_id = gene.organism_id
WHERE  type.name = 'gene'
AND gene.is_deleted = 0
AND 
cv.name IN('molecular_function',   'biological_process',   'cellular_component')
AND GO.is_obsolete = 0
AND organism.common_name = 'dicty'
Including obsolete annotations
SELECT COUNT(fcvt.feature_cvterm_id) annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN cgm_chado.organism ON organism.organism_id = gene.organism_id
WHERE type.name = 'gene'
AND gene.is_deleted = 0
AND cv.name IN('molecular_function',   'biological_process',   'cellular_component')
AND organism.common_name = 'dicty'
Only obsolete annotations
SELECT COUNT(fcvt.feature_cvterm_id) obsolete_annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.organism ON organism.organism_id = gene.organism_id
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
WHERE type.name = 'gene'
AND gene.is_deleted = 0
AND 
cv.name IN('molecular_function',   'biological_process',   'cellular_component')
AND
GO.is_obsolete = 1
AND organism.common_name = 'dicty'

Total GO by aspect

biological_process
SELECT COUNT(fcvt.feature_cvterm_id) process_annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN organism ON organism.organism_id = gene.organism_id
WHERE type.name = 'gene'
AND gene.is_deleted = 0
AND 
cv.name = 'biological_process'
AND GO.is_obsolete = 0
AND organism.common_name = 'dicty'
molecular_function
SELECT COUNT(fcvt.feature_cvterm_id) function_annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN organism ON organism.organism_id = gene.organism_id
WHERE type.name = 'gene'
AND gene.is_deleted = 0
AND 
cv.name = 'molecular_function'
AND GO.is_obsolete = 0
AND organism.common_name = 'dicty'
cellular_component
SELECT COUNT(fcvt.feature_cvterm_id) component_annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN organism ON organism.organism_id = gene.organism_id
WHERE type.name = 'gene'
AND gene.is_deleted = 0
AND 
cv.name = 'cellular_component'
AND GO.is_obsolete = 0
AND organism.common_name = 'dicty'

GO non IEA by aspect - FIXED WITH ECO CODE FEB2017 PF!

Canonical query(all aspects)
SELECT cv.name aspect,count(fcvt.feature_cvterm_id) annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN cgm_chado.feature_cvtermprop fcvt_prop ON fcvt_prop.feature_cvterm_id=fcvt.feature_cvterm_id
JOIN cgm_chado.cvterm evterm ON evterm.cvterm_id=fcvt_prop.type_id
JOIN cgm_chado.cv  ev ON ev.cv_id=evterm.cv_id
JOIN organism ON organism.organism_id = gene.organism_id
WHERE  type.name = 'gene'
ANd gene.is_deleted = 0
AND cv.name IN('molecular_function',   'biological_process',   'cellular_component')
AND GO.is_obsolete = 0
AND ev.name like 'evidence_code%'
AND evterm.name != 'automatic assertion'
AND organism.common_name = 'dicty'
GROUP BY cv.name
ORDER BY annotations DESC

GO by evidence code

Canonical query(all aspects)
SELECT evsyn.synonym_ evidence_code,count(fcvt.feature_cvterm_id) annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN cgm_chado.feature_cvtermprop fcvt_prop ON fcvt_prop.feature_cvterm_id=fcvt.feature_cvterm_id
JOIN cgm_chado.cvterm evterm ON evterm.cvterm_id=fcvt_prop.type_id
JOIN cgm_chado.cv  ev ON ev.cv_id=evterm.cv_id
JOIN cgm_chado.cvtermsynonym evsyn on evterm.cvterm_id=evsyn.cvterm_id
JOIN cgm_chado.cvterm synterm on synterm.cvterm_id=evsyn.type_id
JOIN organism ON organism.organism_id = gene.organism_id
WHERE type.name = 'gene'
AND gene.is_deleted = 0
AND organism.common_name = 'dicty'
AND GO.is_obsolete = 0
AND ev.name like 'evidence_code%'
AND synterm.name IN ('EXACT','RELATED', 'BROAD')
AND LENGTH(evsyn.synonym_) <= 3
AND 
cv.name IN('molecular_function',   'biological_process',   'cellular_component')
GROUP BY evsyn.synonym_
ORDER BY annotations DESC


molecular function only
SELECT evsyn.synonym_ evidence_code,count(fcvt.feature_cvterm_id) annotations
FROM cgm_chado.feature_cvterm fcvt 
JOIN cgm_chado.feature gene ON gene.feature_id = fcvt.feature_id 
JOIN cgm_chado.cvterm type ON type.cvterm_id = gene.TYPE_ID 
JOIN cgm_chado.cvterm GO ON GO.cvterm_id = fcvt.cvterm_id 
JOIN cgm_chado.cv ON cv.cv_id = GO.cv_id
JOIN cgm_chado.feature_cvtermprop fcvt_prop ON fcvt_prop.feature_cvterm_id=fcvt.feature_cvterm_id
JOIN cgm_chado.cvterm evterm ON evterm.cvterm_id=fcvt_prop.type_id
JOIN cgm_chado.cv  ev ON ev.cv_id=evterm.cv_id
JOIN cgm_chado.cvtermsynonym evsyn on evterm.cvterm_id=evsyn.cvterm_id
JOIN cgm_chado.cvterm synterm on synterm.cvterm_id=evsyn.type_id
JOIN organism ON organism.organism_id = gene.organism_id
WHERE type.name = 'gene'
AND gene.is_deleted = 0
AND organism.common_name = 'dicty'
AND GO.is_obsolete = 0
AND ev.name like 'evidence_code%'
AND synterm.name IN ('EXACT','RELATED', 'BROAD')
AND LENGTH(evsyn.synonym_) <= 3
AND 
cv.name = 'molecular_function'
GROUP BY evsyn.synonym_
ORDER BY annotations DESC
For other aspects replace biological_process (in cv.name) with either of biological_process or cellular_component

Genes with GO annotations

All genes with annotations
SELECT COUNT (DISTINCT fcvt.feature_id) gene_with_annotations
FROM CGM_CHADO.feature_cvterm fcvt
JOIN CGM_CHADO.feature_cvtermprop fcvt_prop ON fcvt_prop.feature_cvterm_id = fcvt.feature_cvterm_id
JOIN CGM_CHADO.cvterm evterm ON evterm.cvterm_id=fcvt_prop.type_id
JOIN CGM_CHADO.cv ev ON ev.cv_id=evterm.cv_id
JOIN CGM_CHADO.cvtermsynonym evsyn ON evterm.cvterm_id=evsyn.cvterm_id
JOIN CGM_CHADO.cvterm syn_type ON syn_type.cvterm_id = evsyn.type_id
JOIN CGM_CHADO.cv syn_cv ON syn_cv.cv_id = syn_type.cv_id
JOIN CGM_CHADO.feature gene ON gene.feature_id = fcvt.feature_id
JOIN CGM_CHADO.cvterm type ON type.cvterm_id = gene.type_id
JOIN organism organism ON organism.organism_id = gene.organism_id
WHERE gene.is_deleted = 0
AND type.name = 'gene'
AND ev.name like 'evidence_code%'
AND syn_type.name IN ('EXACT', 'RELATED', 'BROAD')
AND syn_cv.name = 'synonym_type'
AND organism.common_name = 'dicty'
All genes with non-IEA annotations
SELECT COUNT (DISTINCT fcvt.feature_id) gene_with_annotations
FROM CGM_CHADO.feature_cvterm fcvt
JOIN CGM_CHADO.feature_cvtermprop fcvt_prop ON fcvt_prop.feature_cvterm_id = fcvt.feature_cvterm_id
JOIN CGM_CHADO.cvterm evterm ON evterm.cvterm_id=fcvt_prop.type_id
JOIN CGM_CHADO.cv ev ON ev.cv_id=evterm.cv_id
JOIN CGM_CHADO.cvtermsynonym evsyn ON evterm.cvterm_id=evsyn.cvterm_id
JOIN CGM_CHADO.cvterm syn_type ON syn_type.cvterm_id = evsyn.type_id
JOIN CGM_CHADO.cv syn_cv ON syn_cv.cv_id = syn_type.cv_id
JOIN CGM_CHADO.feature gene ON gene.feature_id = fcvt.feature_id
JOIN CGM_CHADO.cvterm type ON type.cvterm_id = gene.type_id
JOIN organism organism ON organism.organism_id = gene.organism_id
WHERE gene.is_deleted = 0
AND type.name = 'gene'
AND ev.name like 'evidence_code%'
AND syn_type.name IN ('EXACT', 'RELATED', 'BROAD')
AND syn_cv.name = 'synonym_type'
AND organism.common_name = 'dicty'
AND evterm.name != 'automatic assertion'
All genes with EXP annotations
SELECT COUNT (DISTINCT fcvt.feature_id) gene_with_annotations
FROM CGM_CHADO.feature_cvterm fcvt
JOIN CGM_CHADO.feature_cvtermprop fcvt_prop ON fcvt_prop.feature_cvterm_id = fcvt.feature_cvterm_id
JOIN CGM_CHADO.cvterm evterm ON evterm.cvterm_id=fcvt_prop.type_id
JOIN CGM_CHADO.cv ev ON ev.cv_id=evterm.cv_id
JOIN CGM_CHADO.cvtermsynonym evsyn ON evterm.cvterm_id=evsyn.cvterm_id
JOIN CGM_CHADO.cvterm syn_type ON syn_type.cvterm_id = evsyn.type_id
JOIN CGM_CHADO.cv syn_cv ON syn_cv.cv_id = syn_type.cv_id
JOIN CGM_CHADO.feature gene ON gene.feature_id = fcvt.feature_id
JOIN CGM_CHADO.cvterm type ON type.cvterm_id = gene.type_id
JOIN organism organism ON organism.organism_id = gene.organism_id
WHERE gene.is_deleted = 0
AND type.name = 'gene'
AND ev.name like 'evidence_code%'
AND syn_type.name IN ('EXACT', 'RELATED', 'BROAD')
AND syn_cv.name = 'synonym_type'
AND organism.common_name = 'dicty'
AND evsyn.synonym_ IN ('direct assay evidence used in manual assertion', 'IDA', '(mutant phenotype evidence used in manual assertion', 'IMP', 'physical interaction evidence', 'IPI', 
'genetic interaction evidence used in manual assertion', 'IGI', 'expression pattern evidence', 'IEP');
To get the gene list, replace the first SELECT line with 'SELECT DISTINCT gene.uniquename gene_id' (no quotes).

New literatures by date

To retrieve pubmed,title,date
SELECT uniquename pubmed,title,created_at from cgm_chado.pub
WHERE 
created_at >= to_date('02/08/2011','MM/DD/YYYY')
AND 
created_at <= to_date('02/15/2011','MM/DD/YYYY')
AND  pubplace = 'PUBMED' 
ORDER BY pub_id
To retrieve only pubmed id
SELECT pub.uniquename pubmed from cgm_chado.pub 
WHERE 
created_at >= to_date('02/08/2011','MM/DD/YYYY')
AND 
created_at <= to_date('02/15/2011','MM/DD/YYYY')
AND  pubplace = 'PUBMED' 
ORDER BY pub_id desc
To retrieve all columns(kind of less useful)
SELECT * from cgm_chado.pub
WHERE 
created_at >= to_date('02/08/2011','MM/DD/YYYY')
AND 
created_at <= to_date('02/15/2011','MM/DD/YYYY')
AND  pubplace = 'PUBMED' 
ORDER BY pub_id

All PubMed Literature in database

SELECT * 
  FROM cgm_chado.pub 
 WHERE pubplace = 'PUBMED'
 ORDER BY UPDATED_AT

Literature, all Annotations, Curator, from Distinct Reference

By reference no
  • Remember to replace 16444 with any other number
SELECT feat.name locus, fpub.pub_id reference_no,pubterm.name literature_topic,
curator.name who_did_it,cpubprop.timecreated when
FROM cgm_chado.feature_pub fpub
JOIN cgm_chado.feature_pubprop fpubprop ON fpubprop.feature_pub_id=fpub.feature_pub_id
JOIN cgm_chado.cvterm pubterm ON pubterm.cvterm_id=fpubprop.type_id
JOIN cgm_chado.cv pubcv ON pubcv.cv_id=pubterm.cv_id
JOIN 
cgm_chado.curator_feature_pubprop cpubprop ON cpubprop.feature_pubprop_id=fpubprop.feature_pubprop_id
JOIN cgm_chado.curator ON curator.curator_id=cpubprop.curator_id
JOIN cgm_chado.feature feat ON feat.feature_id = fpub.feature_id
JOIN cgm_chado.organism ON organism.organism_id = feat.organism_id
WHERE pubcv.name = 'dictyBase_literature_topic'
AND organism.common_name = 'dicty'
AND fpub.pub_id = 16444
By curator
SELECT DISTINCT c.name who_did_it
FROM cgm_chado.pub p
INNER JOIN cgm_chado.feature_pub fp               ON p.pub_id = fp.pub_id
INNER JOIN cgm_chado.feature_pubprop fpp          ON fpp.feature_pub_id = fp.feature_pub_id
INNER JOIN cgm_chado.curator_feature_pubprop cfpp ON cfpp.feature_pubprop_id = fpp.feature_pubprop_id
INNER JOIN cgm_chado.curator c                    ON c.curator_id = cfpp.curator_id
WHERE p.pub_id  = 12399

All lit topics associated with gene names and papers

  • This is ordered by lit topic but you could also ORDER by reference_no
SELECT feat.name locus, fpub.pub_id reference_no,pubterm.name literature_topic,
curator.name who_did_it,cpubprop.timecreated when
FROM cgm_chado.feature_pub fpub
JOIN cgm_chado.feature_pubprop fpubprop ON fpubprop.feature_pub_id=fpub.feature_pub_id
JOIN cgm_chado.cvterm pubterm ON pubterm.cvterm_id=fpubprop.type_id
JOIN cgm_chado.cv pubcv ON pubcv.cv_id=pubterm.cv_id
JOIN 
cgm_chado.curator_feature_pubprop cpubprop ON cpubprop.feature_pubprop_id=fpubprop.feature_pubprop_id
JOIN cgm_chado.curator ON curator.curator_id=cpubprop.curator_id
JOIN cgm_chado.feature feat ON feat.feature_id = fpub.feature_id
JOIN cgm_chado.organism ON organism.organism_id = feat.organism_id
WHERE pubcv.name = 'dictyBase_literature_topic'
AND organism.common_name = 'dicty'
ORDER by literature_topic

Find all lit topics associated with each gene (Gene ID) by paper

 
SELECT dx.accession, p.uniquename, stragg( ct.name ) AS Literature_Topics
FROM cgm_chado.v_gene_features g
INNER JOIN cgm_chado.dbxref dx           ON g.dbxref_id = dx.dbxref_id
INNER JOIN cgm_chado.feature_pub fp      ON g.feature_id = fp.feature_id
INNER JOIN cgm_chado.pub p               ON fp.pub_id = p.pub_id
INNER JOIN cgm_chado.feature_pubprop fpp ON fp.feature_pub_id = fpp.feature_pub_id
INNER JOIN cgm_chado.cvterm ct           ON fpp.type_id = ct.cvterm_id
INNER JOIN cgm_chado.cv                  ON ct.cv_id = cv.cv_id
INNER JOIN cgm_chado.organism o          ON o.organism_id = g.organism_id
WHERE o.common_name = 'dicty'
AND p.pubplace = 'PUBMED'
AND cv.name ='dictyBase_literature_topic'  
GROUP BY dx.accession, p.uniquename
ORDER BY to_number(p.uniquename) 

Notes for Genes containing word string, by curator

To search for both public and private notes use: 'AND ct.name in ('public note' , 'private note')'; or replace one with other. In this example notes containing UTR are searched. This orders by curator, but can be changed also.

SELECT g.name, fp.value, fp.created_by, ct.name
FROM cgm_chado.featureprop fp
INNER JOIN cgm_chado.v_gene_features g   ON g.feature_id = fp.feature_id
INNER JOIN cgm_chado.cvterm ct           ON fp.type_id = ct.cvterm_id
INNER JOIN cgm_chado.organism o          ON o.organism_id    = g.organism_id
WHERE o.common_name = 'dicty'
AND ct.name ='public note'
AND value LIKE '%UTR%'
ORDER BY fp.created_by

Convert gene names to DDB_G IDs

SELECT g.name  AS GENE_NAME, dx.accession AS DDB_G_ID
FROM cgm_chado.v_gene_features g
INNER JOIN cgm_chado.dbxref dx           ON g.dbxref_id = dx.dbxref_id
INNER JOIN cgm_chado.organism o          ON o.organism_id    = g.organism_id
WHERE o.common_name = 'dicty'
AND g.name IN ('rab14', 'rab11A' , 'rab21' )

Genes, Primary IDs from Genes with specific Gene Product

Substitute any gene product for 'TRANSCRIPT%'; the % can be omitted when gene product is distinct.

      SELECT G.GENE_NAME, G.PRIMARY_FEATURE_DICTYBASEID
        FROM CGM_DDB.GENE_PRODUCT GP
  INNER JOIN CGM_DDB.LOCUS_GP LGP
          ON LGP.GENE_PRODUCT_NO = GP.GENE_PRODUCT_NO
  INNER JOIN CGM_CHADO.V_GENE_DICTYBASEID G
          ON LGP.LOCUS_NO = G.GENE_FEATURE_ID
       WHERE UPPER(GP.GENE_PRODUCT) LIKE UPPER('TRANSCRIPT%')
    GROUP BY G.GENE_NAME, G.PRIMARY_FEATURE_DICTYBASEID;

Genes, Primary IDs from Genes with automated Gene Product

 
      SELECT G.GENE_NAME, G.PRIMARY_FEATURE_DICTYBASEID, GP.GENE_PRODUCT
        FROM CGM_DDB.GENE_PRODUCT GP
  INNER JOIN CGM_DDB.LOCUS_GP LGP
          ON LGP.GENE_PRODUCT_NO = GP.GENE_PRODUCT_NO
  INNER JOIN CGM_CHADO.V_GENE_DICTYBASEID G
          ON LGP.LOCUS_NO = G.GENE_FEATURE_ID
       WHERE GP.IS_AUTOMATED ='1'
    order by GP.GENE_PRODUCT

Pseudogene IDs

SELECT feature.uniquename
FROM CGM_CHADO.feature
INNER JOIN CGM_CHADO.cvterm ON cvterm.cvterm_id = feature.TYPE_ID
WHERE cvterm.name = 'pseudogene'
and feature.is_deleted = 0

To get also the Gene Name

SELECT pseudofeat.uniquename pseudogene,genefeat.name gene
FROM CGM_CHADO.v_notdeleted_feature pseudofeat
inner join CGM_CHADO.feature_relationship frel on frel.subject_id=pseudofeat.feature_id
inner join CGM_CHADO.v_notdeleted_feature genefeat on frel.object_id=genefeat.feature_id
inner join CGM_CHADO.cvterm on cvterm.cvterm_id=pseudofeat.type_id
WHERE cvterm.name = 'pseudogene'


Map all primary DDB to GenBank Protein Accession

 select d.accession, f.name, db.name
              from CGM_CHADO.v_primary_feature f
        inner join CGM_CHADO.feature_dbxref fd
                on fd.feature_id = f.feature_id
        inner join CGM_CHADO.dbxref d
                on d.dbxref_id = fd.dbxref_id
        inner join CGM_CHADO.db
                on d.db_id = db.db_id
               and db.name = 'DB:Protein Accession Number';

Genes, DDB IDs with Specific GO terms, created_by

Replace GO ID with any other; delete the 'and created_by..' if not restricted to curator; add 'order by..' after parenthesis to sort by gene_name etc.

SELECT * from cgm_chado.v_gene_dictybaseid 
WHERE gene_feature_id in
	(SELECT distinct locus_no from go_locus_goev WHERE goid = 6118 and created_by='CGM_DDB_PFEY')


Summary paragraphs with gene names


SELECT feature.uniquename,
       p.paragraph_text 
  FROM cgm_chado.featureprop fp 
  JOIN cgm_ddb.paragraph p on TO_NCHAR(fp.value)= p.paragraph_no 
  JOIN cgm_chado.feature ON feature.feature_id=fp.feature_id 
 WHERE fp.type_id=984 
   AND feature.is_deleted=0 
   AND paragraph_text not like '

add at end optional: order by date_created
if all, including status notes is desired, delete: and paragraph_text not like '<summary%><curation_status>%'

Summary by Text string including Curation Status

SELECT p.created_by, FEATURE_ID as n
FROM cgm_chado.featureprop fp
JOIN cgm_ddb.paragraph p on TO_NCHAR(fp.value)= p.paragraph_no
WHERE fp.type_id=984
AND paragraph_text like '%Basic%'

Number of PM references with GO and PHenotype annotations

SELECT TAB_NAME, PRIMARY_KEY, reference_no FROM reflink WHERE reference_no in ( SELECT reference_no FROM reference WHERE pubmed is not null )

Table must be cleaned up: In excel do "convert data to columns" and distangle the PRIMARY_KEY fields. Then put PMIDs in one column and sort, then delete duplicate PMID refs.

All phenotypes

SELECT d.PRIMARY_FEATURE_DICTYBASEID as DICTYBASEID,
d.gene_name, 
       cv.name  
  FROM cgm_chado.feature_genotype fg 
  JOIN cgm_chado.v_gene_dictybaseid d on fg.feature_id = d.GENE_FEATURE_ID  
  JOIN cgm_chado.phenstatement ph on fg.genotype_id = ph.genotype_id 
  JOIN cgm_chado.phenotype p on ph.phenotype_id= p.phenotype_id 
  JOIN cgm_chado.cvterm cv on p.observable_id=cv.cvterm_id 
left outer JOIN cgm_chado.dbxref xr on cv.dbxref_id = xr.dbxref_id
where xr.accession like 'DDPHENO%'

Strain Descriptor, DBS ID within a period

select strain_name, dx.accession from cgm_ddb.stock_center sc 
inner join cgm_chado.dbxref dx on dx.dbxref_id = sc.dbxref_id 
where sc.date_created BETWEEN to_date('01-FEB-2011') AND to_date('31-JAN-2012') 

Different Species available in the SC

select count(sc.systematic_name) num_of_strains, sc.species from cgm_ddb.stock_center sc 
where exists (select 'a' from cgm_ddb.stock_center_inventory sci where sc.id = sci.strain_id)
group by sc.species
order by sc.species DESC


Strain Descriptor, DBS ID, available in the SC

select strain_name, dx.accession from cgm_ddb.stock_center sc 
inner join cgm_chado.dbxref dx on dx.dbxref_id = sc.dbxref_id 
where exists (select 'a' from cgm_ddb.stock_center_inventory sci where sc.id = sci.strain_id) 

Add "AND sc.date_created BETWEEN to_date('01-FEB-2011') AND to_date('31-JAN-2012')" No quotes and change Date as appropriate.

Strain Descriptor, DBS ID, specific mutagenesis method, available in the SC

select strain_name, dx.accession, mutagenesis_method  from cgm_ddb.stock_center sc inner join cgm_chado.dbxref dx 
on dx.dbxref_id = sc.dbxref_id where exists (select 'a' from cgm_ddb.stock_center_inventory sci where sc.id = sci.strain_id) 
and mutagenesis_method = 'REMI'

You can specify other mutagenesis methods

Strain Descriptor, DBS ID, Strain description with specific strain characteristic

select strain_name, dx.accession, mutagenesis_method, strain_description  
from cgm_ddb.stock_center sc
inner join cgm_chado.dbxref dx on dx.dbxref_id = sc.dbxref_id 
inner join cgm_ddb.strain_char_cvterm scc ON sc.id=scc.strain_id
inner join cgm_chado.cvterm ON cvterm.cvterm_id= scc.cvterm_id
and cvterm.name = 'CFP marked'

You can specify other characteristic

Strain Descriptor, DBS ID, Mutagenesis Method, Strain description with specific strain characteristic available in the SC

select sc.strain_name, dx.accession, mutagenesis_method, sc.strain_description  
from cgm_ddb.stock_center sc 
inner join cgm_chado.dbxref dx on dx.dbxref_id = sc.dbxref_id 
inner join cgm_ddb.strain_char_cvterm scc ON sc.id=scc.strain_id
inner join cgm_chado.cvterm ON cvterm.cvterm_id= scc.cvterm_id
where exists 
(select 'a' from cgm_ddb.stock_center_inventory sci where sc.id = sci.strain_id) 
and cvterm.name = 'GFP marked'

You can specify other characteristic

Strain Names, Systematic Names, Strain Description where strain name starts with a specific string (example 'JS')

SELECT strain_name, systematic_name, strain_description FROM stock_center WHERE strain_name like 'JS%' or 
systematic_name like 'JS%'

Can also search for specific strains by using 'equals'

SELECT strain_name, systematic_name, strain_description FROM cgm_ddb.stock_center WHERE strain_name = 'JS65' or 
systematic_name = 'JS65'

Strain Names, Synonyms, Systematic Names, Created_by, Strain Description

SELECT strain_name, stragg( name) as synonyms, systematic_name, created_by, strain_description FROM cgm_ddb.stock_center  sc
JOIN cgm_ddb.STRAIN_SYNONYM ss on ss.strain_id = sc.id
JOIN cgm_chado.synonym_ cs on ss.synonym_id= cs.synonym_id 
GROUP BY strain_name, created_by, strain_description, systematic_name
ORDER BY created_by, strain_name

Can also restrict the curator by adding

where created_by ='CGM_DDB_PASC' 

before the GROUP BY statement

Strain Names, Created_by, Strain Description, Strain Characteristics

  SELECT strain_name,   
         systematic_name,
         strain_description,
         stragg( name) as Strain_characteristics
     FROM cgm_ddb.stock_center s 
     join cgm_ddb.STRAIN_CHAR_CVTERM sc on s.id = sc.strain_id
     join cgm_chado.CVTERM cv on sc.CVTERM_ID = cv.CVTERM_ID
    where strain_description like '%null%'
    group by strain_name,  systematic_name, strain_description

Strain Names, Created_by, by a specific plasmid


SELECT strain_name,
       plasmid,
       mutagenesis_method,
       created_by 
  FROM cgm_ddb.stock_center 
 WHERE plasmid like '%pEXP%' or plasmid like '%pExp%'
 ORDER BY created_by

Strain Names, Systematic Name, Strain Description, by a specific Strain Characteristic

SELECT strain_name,
       systematic_name,
       name,
       strain_description 
  FROM stock_center s 
  JOIN STRAIN_CHAR_CVTERM sc on s.id = sc.strain_id 
  JOIN cgm_chado.CVTERM cv on sc.CVTERM_ID = cv.CVTERM_ID 
 WHERE name ='multiple null mutant'
 ORDER BY strain_name

Strains annotated with a specific environment condition

  SELECT strain_name, systematic_name, name as envt     
  FROM cgm_ddb.stock_center s    
  JOIN cgm_chado.phenstatement ps  ON s.genotype_id = ps.genotype_id 
  JOIN cgm_chado.cvterm cv ON cv.CVTERM_ID = ps.ENVIRONMENT_ID
  WHERE name ='in suspension'

Strains annotated with a specific assay

SELECT strain_name, systematic_name, name as assay       
FROM cgm_ddb.stock_center s    
JOIN cgm_chado.phenstatement ph on s.genotype_id = ph.genotype_id 
JOIN cgm_chado.phenotype p ON ph.phenotype_id = p.phenotype_id
JOIN cgm_chado.cvterm cv ON cv.CVTERM_ID = p.assay_ID
WHERE name ='electron microscopy'

Stock Center Orders by name

THIS DOES NOT WORK - maybe View is gone?
Substitute any name or use 'last_name' and order by anything, here by order ID.

SELECT * from cgm_ddb.stock_order_view WHERE first_name = 'Baskar' ORDER BY Order_ID

Total stock center orders within a period

SELECT COUNT(stock_order_id) num_of_order
FROM cgm_ddb.stock_order
WHERE order_date BETWEEN to_date('01-AUG-10')
 AND to_date('31-JUL-11')


Replace the two to_date parameters for your date of choice

SELECT stock_order_id 
FROM cgm_ddb.stock_order
WHERE order_date BETWEEN to_date('08-JUL-15')
 AND to_date('16-JUL-15')

Use this to get the list of order numbers with the selected period

Total strain orders within a period

SELECT COUNT(distinct sorder.stock_order_id) num_of_strains
 FROM cgm_ddb.stock_center sc 
JOIN cgm_ddb.stock_item_order sitem ON sc.id = sitem.item_id 
JOIN cgm_ddb.stock_order sorder ON sorder.stock_order_id = sitem.order_id
WHERE sorder.order_date BETWEEN to_date('01-AUG-10')
 AND to_date('31-JUL-11')

Total number of STRAINS ordered within a period

SELECT COUNT(sorder.stock_order_id) num_of_strains
 FROM cgm_ddb.stock_center sc 
JOIN cgm_ddb.stock_item_order sitem ON sc.id = sitem.item_id 
JOIN cgm_ddb.stock_order sorder ON sorder.stock_order_id = sitem.order_id
WHERE sorder.order_date BETWEEN to_date('01-FEB-12')
 AND to_date('31-JAN-13')

Number EACH STRAIN has been ordered within a period with DBS ID, strain name

SELECT dx.accession, sc.strain_name, COUNT(sio.order_id) AS NUM_ORDERS
FROM cgm_ddb.stock_center sc
JOIN cgm_chado.dbxref dx on dx.dbxref_id = sc.dbxref_id 
JOIN CGM_DDB.stock_item_order sio ON sc.id = sio.item_id
JOIN cgm_ddb.strain_char_cvterm scc ON sc.id=scc.strain_id
JOIN cgm_chado.cvterm ON cvterm.cvterm_id= scc.cvterm_id
JOIN cgm_ddb.stock_order sorder ON sorder.stock_order_id = sio.order_id
WHERE sorder.order_date BETWEEN to_date('07-JUN-10') AND to_date('29-AUG-12')
GROUP BY dx.accession, sc.strain_name
ORDER BY NUM_ORDERS DESC;

Replace date, change ORDER BY, or ASC instead of DESC; restrict search for specific mutant by adding "WHERE cvterm.name = 'null mutant' " after the last JOIN and changing the next WHERE to AND; restrict to specific strain name by adding AND sc.strain_name = 'AX3' (choose strain name) after the order date line.

No of plasmids added within a period

select plasmid.name,plasmid.description FROM plasmid
where plasmid.date_created BETWEEN to_date('01-FEB-2011') AND to_date('31-JAN-2012')

The above query does not work!

Total number of PLASMIDS ordered within a period

SELECT COUNT(sorder.stock_order_id) num_of_plasmids
 FROM cgm_ddb.plasmid sc 
JOIN cgm_ddb.stock_item_order sitem ON sc.id = sitem.item_id 
JOIN cgm_ddb.stock_order sorder ON sorder.stock_order_id = sitem.order_id
WHERE sorder.order_date BETWEEN to_date('01-FEB-12')
 AND to_date('31-JAN-13')

Total plasmid orders within a period


SELECT COUNT(distinct sorder.stock_order_id) num_of_plasmids
 FROM cgm_ddb.plasmid sc 
JOIN cgm_ddb.stock_item_order sitem ON sc.id = sitem.item_id 
JOIN cgm_ddb.stock_order sorder ON sorder.stock_order_id = sitem.order_id
WHERE sorder.order_date BETWEEN to_date('01-AUG-10')
 AND to_date('31-JUL-11')

Distinct Stock Center users within a period

SELECT COUNT(distinct colleague_no) num_of_order
FROM cgm_ddb.stock_order_view
WHERE order_date BETWEEN to_date('01-FEB-11')
 AND to_date('31-JAN-12')

Stock center orders by countries


SELECT COUNT(so.stock_order_id) num_of_orders, coll.country country
FROM cgm_ddb.stock_order so
JOIN cgm_ddb.colleague coll ON coll.colleague_no=so.colleague_id
WHERE so.order_date BETWEEN to_date('01-AUG-10')
 AND to_date('31-JUL-11')
GROUP BY coll.country
ORDER BY num_of_orders DESC

Most, least and avg no of items per stock center order


WITH item_counts AS
  (SELECT COUNT(colleague_no) items,
     order_id
   FROM stock_order_view
   WHERE order_date BETWEEN to_date('01-AUG-10')
   AND to_date('31-JUL-11')
   GROUP BY order_id)

SELECT MAX(items) max_items,
  MIN(items) min_items,
  ROUND(AVG(items)) avg_items
FROM item_counts


Order rows in a resultset

select name,uniquename,seqlen,residues,created_by from cgm_chado.feature order by timelastmodified asc|desc

asc => should sort the result by ascending order
desc => should sort the result by descending order

No of Colleagues and more

select count(COLLEAGUE_NO) ccount from cgm_ddb.COLLEAGUE;
select  LAST_NAME, DATE_MODIFIED, COUNTRY FROM cgm_ddb.COLLEAGUE
 ORDER BY COUNTRY

This query gives the last name, date modified, and country of all subscribed colleagues.

Select feature types by dictyBase ID

select ncrna.uniquename FROM cgm_chado.feature ncrna join
cgm_chado.cvterm nctype on nctype.cvterm_id=ncrna.type_id
where nctype.name like '%snoRNA%' OR
nctype.name = 'ncRNA'
or nctype.name like 'class%RNA' or nctype.name = 'rRNA' or
nctype.name = 'snRNA' OR nctype.name LIKE 'RNase%'
OR nctype.name LIKE 'SRP%'

Select Feature types (these are all ncRNAs including tRNAs) by DDB ID with gene name, gene products (bundled into one field by comma).

select gene.name, stragg (gp.gene_product) as gene_product, ncrna.uniquename FROM cgm_chado.feature ncrna join
cgm_chado.cvterm nctype on nctype.cvterm_id=ncrna.type_id
INNER JOIN cgm_chado.feature_relationship fr
ON ncrna.feature_id = fr.subject_id
INNER JOIN cgm_chado.cvterm rcvt
ON rcvt.cvterm_id = fr.type_id
INNER JOIN cgm_chado.feature gene
ON gene.feature_id = fr.object_id
INNER JOIN cgm_ddb.locus_gp lg
ON lg.locus_no = gene.feature_id
INNER JOIN cgm_ddb.gene_product gp
ON lg.gene_product_no = gp.gene_product_no
where nctype.name like '%snoRNA%' OR
nctype.name = 'ncRNA' OR nctype.name = 'tRNA'
or nctype.name like 'class%RNA' or nctype.name = 'rRNA' or
nctype.name = 'snRNA' OR nctype.name LIKE 'RNase%'
OR nctype.name LIKE 'SRP%' group by gene.name, ncrna.uniquename

Select gene name, gene synonym and protein synonym, DDB_G ID

You can order by gsyn instead of gene.name, or both separated by a comma; this query excludes pseudogenes:

WITH dicty_coding_genes AS (
SELECT dbxref.accession gene_id
FROM cgm_chado.feature gene
JOIN organism ON organism.organism_id=gene.organism_id
JOIN dbxref on dbxref.dbxref_id=gene.dbxref_id
JOIN cgm_chado.cvterm gtype on gtype.cvterm_id=gene.type_id
JOIN cgm_chado.feature_relationship frel ON frel.object_id=gene.feature_id
JOIN cgm_chado.feature mrna ON frel.subject_id=mrna.feature_id
JOIN cgm_chado.cvterm mtype ON mtype.cvterm_id=mrna.type_id
WHERE gtype.name='gene'
AND mtype.name='mRNA'
AND organism.common_name = 'dicty'
AND gene.name NOT LIKE '%\_ps%'
ESCAPE '\'
group by dbxref.accession
)

SELECT gene.name,wm_concat(syn.name) gsyn,dg.gene_id gene_id
FROM cgm_chado.feature gene
JOIN cgm_chado.dbxref on gene.dbxref_id=dbxref.dbxref_id
JOIN dicty_coding_genes dg on dg.gene_id=dbxref.accession
LEFT JOIN cgm_chado.feature_synonym fsyn on gene.feature_id=fsyn.feature_id
LEFT JOIN cgm_chado.synonym_ syn on syn.synonym_id=fsyn.SYNONYM_ID
group by gene.name,dg.gene_id
order by gene.name DESC

To include pseudogenes:

WITH dicty_coding_genes AS (
SELECT dbxref.accession gene_id
FROM cgm_chado.feature gene
JOIN organism ON organism.organism_id=gene.organism_id
JOIN dbxref on dbxref.dbxref_id=gene.dbxref_id
JOIN cgm_chado.cvterm gtype on gtype.cvterm_id=gene.type_id
JOIN cgm_chado.feature_relationship frel ON frel.object_id=gene.feature_id
JOIN cgm_chado.feature mrna ON frel.subject_id=mrna.feature_id
JOIN cgm_chado.cvterm mtype ON mtype.cvterm_id=mrna.type_id
WHERE gtype.name='gene'
AND mtype.name='mRNA'
AND organism.common_name = 'dicty'

group by dbxref.accession
)

SELECT gene.name,wm_concat(syn.name) gsyn,dg.gene_id gene_id
FROM cgm_chado.feature gene
JOIN cgm_chado.dbxref on gene.dbxref_id=dbxref.dbxref_id
JOIN dicty_coding_genes dg on dg.gene_id=dbxref.accession
LEFT JOIN cgm_chado.feature_synonym fsyn on gene.feature_id=fsyn.feature_id
LEFT JOIN cgm_chado.synonym_ syn on syn.synonym_id=fsyn.SYNONYM_ID
group by gene.name,dg.gene_id
order by gene.name DESC

Find genes by gene name common root, e.g. _RTE

change 'gene.is_deleted = 0' to gene.is_deleted = 1' to get the deleted genes; works for _TE, or _ps% (% wildcard to get ps1,2 etc).

WITH dicty_genes AS (
SELECT dbxref.accession gene_id,gene.name gene_name
FROM cgm_chado.feature gene
JOIN cgm_chado.organism ON organism.organism_id=gene.organism_id
JOIN cgm_chado.dbxref ON dbxref.dbxref_id=gene.dbxref_id
JOIN cgm_chado.cvterm gtype ON gtype.cvterm_id=gene.type_id
WHERE gtype.name='gene'
AND organism.common_name = 'dicty'
AND gene.is_deleted = 0
)

SELECT gene_id,gene_name FROM dicty_genes
WHERE gene_name LIKE '%\_RTE'
ESCAPE '\'

Find feature types of genes by ID

Add as many IDs as desired.

SELECT dx.accession, stragg(ct.name) as models
FROM cgm_chado.v_gene_features g
INNER JOIN cgm_chado.feature_relationship fr ON g.feature_id = fr.object_id
INNER JOIN cgm_chado.feature f     ON fr.subject_id = f.feature_id
INNER JOIN cgm_chado.cvterm ct on f.type_id = ct.cvterm_id
inner join cgm_chado.dbxref dx on dx.dbxref_id = g.dbxref_id
where dx.accession in (
'DDB_G0271556',
'DDB_G0278875',
'DDB_G0305569'
) group by dx.accession

Genes with community annotations

SELECT f.name 
  FROM cgm_chado.feature f 
  JOIN cgm_chado.FEATUREPROP fp on f.feature_id = fp.feature_id 
 WHERE fp.value like 'Has Wiki Page'
 ORDER BY f.name

Genes with community annotations on Gene Page (description)


SELECT f.name,
       p.value as descriptions 
  FROM cgm_chado.featureprop p inner 
  JOIN cgm_chado.v_gene_features l on l.feature_id=p.feature_id 
  JOIN cgm_chado.feature f on f.feature_id = p.feature_id 
 WHERE p.type_id=972 
   AND p.feature_id in ( SELECT feature_id 
	  FROM cgm_chado.FEATUREPROP 
	 WHERE value like 'Has Wiki Page'  )





Return to [Curation To Do]

Personal tools