-------------------------------------------------------------------
-- chado_demo-sample-queries.sql
--
-- This document contains sample SQL queries for the Sybil demo 
-- database, chado_demo, which is available for download at 
-- http://sybil.sourceforge.net/downloads.html  These queries
-- use PostgreSQL syntax.
-- 
-- Not all Sybil chado databases look exactly like the chado_demo 
-- database, so this document also includes a number of notes 
-- about the differences between chado_demo and some of the newer
-- chado databases created for use with Sybil (and also, in some
-- cases, between chado_demo and non-Sybil chado databases.)  These 
-- notes are all prefaced by the tag "*CHADO USAGE NOTE" and can
-- be ignored by those only interested in querying chado_demo.
--
-- The tag "NOTE: slow query" is used to indicate queries that are 
-- inordinately slow (using a stock PostgreSQL 8.0.3 gentoo 
-- install on a Dell Inspiron 8600 laptop.)
-------------------------------------------------------------------

psql chado_demo

-- List the comparative analyses that have been loaded into chado_demo:
SELECT analysis_id, program, timeexecuted FROM analysis;

-- analysis_id | program |    timeexecuted     
-- -------------+---------+---------------------
--           1 | blastp  | 2004-12-02 04:22:00
--           2 | cogs    | 2004-12-02 04:22:00
--           3 | jaccard | 2004-12-02 04:22:00
--           4 | jaccard | 2004-12-02 04:22:00
--           5 | jaccard | 2004-12-02 04:22:00
-- (5 rows)

-- blastp  : an all-vs-all WU-BLASTP analysis run on the polypeptide
--           sequence (feature.residues) of every chado feature of 
--           type 'protein'
-- cogs    : Sybil COGs analysis
-- jaccard : Sybil Jaccard cluster analyses
--
-- See http://sybil.sourceforge.net/documentation.html#algorithms for 
-- more information on the above algorithms.

-- *CHADO USAGE NOTE: In other Sybil databases the three Jaccard
-- cluster analyses shown above could be loaded as a single analysis
-- instead of as three separate analyses.  Since there are three
-- distinct genomes/organisms in chado_demo, a Jaccard cluster
-- analysis was run on each of them.  However, more recent versions 
-- of the software are capable of combining multiple Jaccard cluster 
-- runs into a single workflow/chado analysis.

-- List the genomes/organisms that have been loaded into chado_demo:
SELECT organism_id, genus, species FROM organism;

--  organism_id |      genus      |  species   
-- -------------+-----------------+------------
--            1 | cog             | cog
--            2 | Plasmodium      | falciparum
--            3 | Plasmodium      | yoelii
--            4 | Cryptosporidium | parvum
-- (4 rows)

-- In chado the feature.organism_id column is not nullable.  Therefore
-- in Sybil a "dummy" organism is created (the one with genus and
-- species = "cog" in the above list), whenever it is necessary to
-- create features whose source organism is either unknown or not
-- well-defined.  By convention the Sybil Jaccard cluster and COG
-- protein cluster features are always assigned this dummy organism, 
-- even for those clusters/COGs that contain proteins from only one
-- organism/genome.  The following queries illustrate this point:

SELECT DISTINCT a.program, cf.organism_id
FROM analysis a, analysisfeature af, feature cf
WHERE a.program = 'cogs'
AND a.analysis_id = af.analysis_id
AND af.feature_id = cf.feature_id;
-- NOTE: slow query

--  program | organism_id 
-- ---------+-------------
--  cogs    |           1
-- (1 row)

-- All Sybil COG features are assigned organism_id = 1 (dummy organism_id)

SELECT DISTINCT a.program, cf.organism_id
FROM analysis a, analysisfeature af, feature cf
WHERE a.program = 'jaccard'
AND a.analysis_id = af.analysis_id
AND af.feature_id = cf.feature_id;
-- NOTE: slow query

--  program | organism_id 
-- ---------+-------------
--  jaccard |           1
-- (1 row)

-- All Sybil Jaccard cluster features are assigned organism_id = 1 (dummy organism_id)

-- *CHADO USAGE NOTE: In more recent Sybil databases the word "cog" in
-- the dummy organism entry has been replaced by "not known" or
-- "unknown".  The genus and species columns in the organism table are 
-- not nullable, so some value must be supplied for both.

-- The 'jaccard' and 'cogs' analyses are both protein clustering
-- analyses, meaning that their input is a set of proteins and their
-- output is a set of protein clusters or groups.  The inputs of these
-- analyses cannot be queried directly from chado, although it is
-- possible to glean some information by looking at the analysisprop
-- table.  For example, the Jaccard cluster analyses in chado_demo use
-- an analysisprop called "query_bsml_file_list" to indicate the file
-- that contained the list of proteins on which the analysis was run:

SELECT a.analysis_id, cv.name, ap.value
FROM analysis a, analysisprop ap, cvterm cv
WHERE a.program = 'jaccard'
AND a.analysis_id = ap.analysis_id
AND ap.type_id = cv.cvterm_id
AND cv.name = 'query_bsml_file_list'
ORDER BY a.analysis_id ASC;

--  analysis_id |         name         |                                 value                                  
-- -------------+----------------------+------------------------------------------------------------------------
--            3 | query_bsml_file_list | /usr/local/annotation/CHADO_TEST/workflow_config_files/pya1_orgs.list
--            4 | query_bsml_file_list | /usr/local/annotation/CHADO_TEST/workflow_config_files/pfa1_orgs.list
--            5 | query_bsml_file_list | /usr/local/annotation/CHADO_TEST/workflow_config_files/cspa1_orgs.list
-- (3 rows)

-- From this it is possible to infer that analysis_id 3 corresponds to
-- a P. yoelii Jaccard cluster analysis, analysis_id 4 to a
-- P. falciparum analysis, and analysis_id 5 to a C. parvum analysis.
-- There are a number of other analysisprops that provide additional
-- information on the configuration of the Sybil analyses, but these 
-- properties will not necessarily be shared with non-TIGR/non-Sybil 
-- chado comparative databases, and may not use a standard ontology/cv.

-- *CHADO USAGE NOTE: In some chado databases the analysis.sourcename
-- column is used to store the name of the file from which the results
-- of the analysis were loaded.  In chado_demo and other Sybil
-- comparative databases, however, the sourcename gives the name of
-- the directory (and, indirectly, the name of a corresponding
-- "workflow id") where these output files were written.  These file
-- and directory names are obviously only useful if one happens to be
-- using a machine that has access to the relevant filesystem(s):

SELECT analysis_id, program, sourcename FROM analysis;

--  analysis_id | program |                       sourcename                       
-- -------------+---------+--------------------------------------------------------
--            1 | blastp  | /usr/local/annotation/CHADO_TEST/Workflow/blastp/13953
--            2 | cogs    | /usr/local/annotation/CHADO_TEST/Workflow/cogs/28646
--            3 | jaccard | /usr/local/annotation/CHADO_TEST/Workflow/jaccard/1358
--            4 | jaccard | /usr/local/annotation/CHADO_TEST/Workflow/jaccard/1364
--            5 | jaccard | /usr/local/annotation/CHADO_TEST/Workflow/jaccard/1363
-- (5 rows)

-- Query for the number of features generated by each analysis:
SELECT a.analysis_id, a.program, COUNT(cf.feature_id)
FROM analysis a, analysisfeature af, feature cf
WHERE a.analysis_id = af.analysis_id
AND af.feature_id = cf.feature_id
GROUP BY a.analysis_id, a.program;
-- NOTE: slow query

--  analysis_id | program |  count  
-- -------------+---------+---------
--            3 | jaccard |     410
--            5 | jaccard |       7
--            4 | jaccard |      71
--            2 | cogs    |    3921
--            1 | blastp  | 4044057
-- (5 rows)

-- There are > 4 million blastp features because each feature corresponds
-- to a BLASTP HSP, and an all-vs-all BLASTP search was run on all of the
-- proteins in the database, with relatively permissive result filtering
-- settings.

-- *CHADO USAGE NOTE: chado_demo does not have any explicit "HSP-grouping" 
-- features.  More recent Sybil chado databases and other sites' chado
-- databases may have additional features that group together all BLAST 
-- HSPs that relate to a given pair of aligned sequences, using the chado
-- feature_relationship table.  Storing these "HSP group" features also
-- allows one to store match statistics (e.g., the BLAST "Sum-P" statistics)
-- that pertain to multiple HSPs, but would increase the feature count
-- even more than is shown above.

-- There are relatively few jaccard cluster features because the Jaccard
-- clustering step is used to group together very closely-related proteins
-- (e.g. close paralogs) within each genome.  The above counts give some
-- indication of the approximate number of close paralogs in each genome.
-- The cogs analysis, on the other hand, groups proteins from 2 or more
-- *distinct* genomes, using best reciprocal blastp matches.  For reasonably
-- closely-related genomes, then, there are usually many more Sybil cogs 
-- than there are Jaccard clusters.

-- Jaccard clusters and Sybil COGs are both encoded in the chado_demo
-- database by using the featureloc table to group together proteins 
-- that belong to the same protein cluster (either a COG or a Jaccard
-- cluster.)  The encoding for a single protein cluster with 3 
-- proteins would look something like this:

--            analysis (1 row shared by all clusters in the analysis)
--                |
--                |
--                |
--         analysisfeature (1 row for each protein cluster)
--         /      |      \
--        /       |       \  <- featureloc.feature_id
--       /        |        \
--  featureloc featureloc featureloc (1 row for each protein *in* the cluster)
--      |         |          |
--      |         |          | <- featureloc.srcfeature_id
--      |         |          |
--   feature    feature    feature  (existing 'protein' features)
--
-- The featureloc.srcfeature_id points to the protein feature, and the
-- featureloc.feature_id points to the protein cluster feature.  This is
-- a somewhat arbitrary convention, but motivated by the idea that in 
-- future comparative databases there might be protein clusters based
-- on a set of proteins with a core set of conserved residues or some 
-- type of shared motif.  Since featureloc.fmin and featureloc.fmax refer 
-- to the position on the srcfeature, these columns could be used to 
-- indicate the position of the "conserved core" or common motif in each 
-- of the clustered proteins.
--
-- *CHADO USAGE NOTE: Older Sybil chado comparative databases used the
-- opposite convention; featureloc.srcfeature_id pointed to the protein
-- cluster feature, and featureloc.feature_id to the individual proteins.

-- Here is a query that retrieves all the proteins that belong to
-- a specific protein cluster (from the "cogs" analysis in this case):
SELECT p.uniquename, fl.fmin, fl.fmax, fl.strand, fl.locgroup, fl.rank
FROM feature cf, featureloc fl, feature p
WHERE cf.uniquename = '3890169_cogs'
AND fl.feature_id = cf.feature_id
AND fl.srcfeature_id = p.feature_id;

--         uniquename        | fmin | fmax | strand | locgroup | rank 
-- --------------------------+------+------+--------+----------+------
--  pya1.804.m00057_protein  |      |      |        |        0 |    1
--  pfa1.2280.m00238_protein |      |      |        |        0 |    2
-- (2 rows)

-- This protein cluster contains only two proteins; one from P. yoelii
-- and the other from P. falciparum.  Note that most of the featureloc
-- columns (fmin, fmax, strand, etc.) are set to NULL.  Although the
-- rank column has been set, the order in which the proteins appear is
-- not particularly significant in these analyses.

-- Another detail of the protein cluster encoding used in chado_demo is
-- that the protein cluster feature itself has a clustalw multiple 
-- sequence alignment associated with it.  This alignment is stored in
-- the "residues" column of the protein cluster feature:

SELECT cf.residues
FROM feature cf
WHERE cf.uniquename = '3890169_cogs';

-- ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-- pya1.804.m00057_protein       MFSNVFKIDK YMHYKNKIKN RTYNLSRVIT NLYEKKKKKK LENVVVRIRK
-- pfa1.2280.m00238_protein      .......... MCAYKDSVKD THHKLK...T NEIIEKTETK LENVVVRIRK
-- pya1.804.m00057_protein       LEQNEEITLQ TDPKDKTALY FKKDFEIEKY TFDLVFDGND NNESIFNEIG
-- pfa1.2280.m00238_protein      LDKNEKSTLH TDPNDRTALY FDKDFEIEKY NFDIVFDEND DNQTIFNKIG
-- pya1.804.m00057_protein       GHEIVNSVCR GYKETIITYG QTGSGKTYTL FGSEKEYGLI YYFIYHLYKL
-- pfa1.2280.m00238_protein      GQFIINNVCN GFKETIITYG QTGSGKTYTL FGSNKEYGII YYFVHYLYKL
-- pya1.804.m00057_protein       CNKKN.YIKS SIYLSIYEIL GDTLIDLMSY QNEKNIEFYT EEYYLKTIKY
-- pfa1.2280.m00238_protein      CNSKNKNKKK AIYLSIYEIL GDTLIDLISN LNEKNIEFYT EEYYLKTIRY
-- pya1.804.m00057_protein       SYKVVNIKSY ESAKKMIDAA CMLRNVEATS QNMRSSRSHA IIQFFVNISE
-- pfa1.2280.m00238_protein      SYKVVNIQNY DIAKKIIDSA CLLRNVEATS QNMRSSRSHA IIQFFVNISD
-- pya1.804.m00057_protein       SNISNNVKTV KDYYGVLTLV DLVGCEREEY NANKNINKYE YKGGNTNDKT
-- pfa1.2280.m00238_protein      CTLHNGIETI KDYYGVLTLV DLVGCEREEF NTNKNE.... ....HRNDKT
-- pya1.804.m00057_protein       SARVLNSSLS SLNKMLRKMQ L......... .......... ..........
-- pfa1.2280.m00238_protein      SSKFLNSSLT SLNKMLRKMQ MGNLDESDKR QSVLCKVLFN YIQKTCGVCL
-- pya1.804.m00057_protein       .......... ...TSSTLTM ANECKKIKSK RKQLFYVNSE SREEFFKNIK
-- pfa1.2280.m00238_protein      IFCFNPKISQ KSLTSSTLIM ASDCKKIKSK RKQLIYVKSE NKESFFKKIA
-- pya1.804.m00057_protein       TEGNTHATTN SGSKKTNAKN ETNEMDETNE NQFKGTKQNN AHSKSTQQID
-- pfa1.2280.m00238_protein      NDDSSKNGNN KEKEKKYEGS KWENYNTIKK N.VENGKDKE EEMEKKHVEN
-- pya1.804.m00057_protein       IHNSDKNDIN DKVKVRNNSS IFVLYTNDDN NK.IINITDK KVDNEKYNLL
-- pfa1.2280.m00238_protein      INNLVEIEEN KYLRLENNCA IFLIYLNDDK QEDVINILNK KKDNGKLNSL
-- pya1.804.m00057_protein       RNLLHEIVEE KMMEDKNNKN TIEQLKNDIS KLKNECNFWK KETYNYHNKL
-- pfa1.2280.m00238_protein      KKILCDIINE QVKEEKKRKN VLEQLKSDNL KLKNENDYWK KEAYNYHNKL
-- pya1.804.m00057_protein       KMLNKNYLKI NEFLFKTLNN N......S.. .......... ....IN....
-- pfa1.2280.m00238_protein      KILNKNYLKM NEFLLKTLNA STNDNNTSSD YTTSLFSKLD EKKGINDERQ
-- pya1.804.m00057_protein       .......... ..STNSNDTK INYNTRQNSN .....S.... ..........
-- pfa1.2280.m00238_protein      KNQTRGKNDM NLSSHNNDDT INNDNDDNND DNNDDNNIDN NDNNIDNIHN
-- pya1.804.m00057_protein       .....PNS.. ..YHNYSLKN YPNETRKYEK SQS.....LS TNLNRKLCED
-- pfa1.2280.m00238_protein      NIDNIHNNNN YYYNNQVVNT YAQLKQKKQL VQNKSAHILN NSSSDILRTS
-- pya1.804.m00057_protein       NKNIPISPCT .....HK... .......... ....K....E TIDIENSYLK
-- pfa1.2280.m00238_protein      DKGNSHNPTT YYNEDHLLLH KNNTQFIEEK NKQRKEYYKE VINNKGSNIN
-- pya1.804.m00057_protein       KKNSIS.... PDSFR..TKK NLENEE.... ........N. ........SS
-- pfa1.2280.m00238_protein      KADDQTNNRI EDNLRNHILN NINNEKNYIN DYTNNKSYQE EIPYPNKEDN
-- pya1.804.m00057_protein       KRTSG..... ........YS KINSERY... ...N...LKS KPITEIYQIN
-- pfa1.2280.m00238_protein      KHRSNKMMLQ NNMIKGEIYR KNIYEKYKEN GVTNNMEIEM KNKKKINNLN
-- pya1.804.m00057_protein       EQGLEKN... ..S....... .....FYNKH RSFSEAYN.. ..........
-- pfa1.2280.m00238_protein      SQNIQRNNSM KETTNMLSID NINKKYYEQY KNISPENNTC IITDKLNLLQ
-- pya1.804.m00057_protein       .......... .....YDEKK ITK....... .......... ..........
-- pfa1.2280.m00238_protein      KKKDTTKNII HPLSNNDNKM LTNITIDSLA TKIKNRLLKS RSLSIAREHK
-- pya1.804.m00057_protein       
-- pfa1.2280.m00238_protein      
--
-- (1 row)

-- Note that "singleton" clusters (i.e., clusters that contain exactly
-- 1 protein) are not represented explicitly in chado_demo (or any
-- other Sybil comparative database thus far).  Only clusters of size
-- 2 or greater are given an entry in the chado feature table.  If
-- singleton clusters *were* stored explicitly, then it would be
-- possible--for these particular analysis types--to determine the
-- input protein set by looking at the output protein clusters.
-- Another property of the Sybil cluster analyses is that the output
-- clusters never overlap; in other words, each input protein will be
-- assigned to exactly one output cluster by the clustering algorithm(s)
-- (although some proteins may be assigned to singleton clusters, which
-- must be inferred.)

-- Proteins clustered into non-singleton clusters by the Sybil COGs analysis:
SELECT o.genus, o.species, COUNT(*) AS num_proteins
FROM analysisfeature af, feature cf, featureloc fl, feature p, organism o
WHERE af.analysis_id = 2
AND af.feature_id = cf.feature_id
AND fl.feature_id = cf.feature_id
AND fl.srcfeature_id = p.feature_id
AND p.organism_id = o.organism_id
GROUP BY o.genus, o.species;
-- NOTE: slow query

--       genus      |  species   | num_proteins 
-- -----------------+------------+--------------
--  Plasmodium      | yoelii     |         3885
--  Plasmodium      | falciparum |         3928
--  Cryptosporidium | parvum     |         1369
-- (3 rows)

-- For the Jaccard cluster analyses, which are strictly single-genome, a
-- variant of the above query gives a way to determine which analysis 
-- corresponds to which input genome:
SELECT a.analysis_id, o.genus, o.species, COUNT(*) AS num_proteins
FROM analysis a, analysisfeature af, feature cf, featureloc fl, feature p, organism o
WHERE a.program = 'jaccard'
AND a.analysis_id = af.analysis_id
AND af.feature_id = cf.feature_id
AND fl.feature_id = cf.feature_id
AND fl.srcfeature_id = p.feature_id
AND p.organism_id = o.organism_id
GROUP BY a.analysis_id, o.genus, o.species;
-- NOTE: slow query

--  analysis_id |      genus      |  species   | num_proteins 
-- -------------+-----------------+------------+--------------
--            5 | Cryptosporidium | parvum     |           16
--            3 | Plasmodium      | yoelii     |         1110
--            4 | Plasmodium      | falciparum |          197
-- (3 rows)

-- Note that this agrees with the Jaccard analysis - genome mapping
-- that we were able to infer by looking at the query_bsml_file_list
-- analysisprop.

-- Query for all the clusters to which a given protein belongs.
SELECT cf.uniquename, a.program
FROM feature p, featureloc fl, feature cf, analysisfeature af, analysis a
WHERE p.uniquename = 'pfa1.2280.m00238_protein'
AND p.feature_id = fl.srcfeature_id
AND fl.feature_id = af.feature_id
AND af.feature_id = cf.feature_id
AND af.analysis_id = a.analysis_id
AND a.program in ('jaccard', 'cogs');

--   uniquename  | program 
-- --------------+---------
--  3890169_cogs | cogs
-- (1 row)

-- *CHADO USAGE NOTE: Newer Sybil comparative databases use the Sequence
-- Ontology (SO) to describe features and in the latest chado databases,
-- the SO term 'polypeptide' is used instead of 'protein'.