Source code for sramongo.models

"""Document models for MongoDB"""
from datetime import datetime

from mongoengine import Document, EmbeddedDocument
from mongoengine import (
    StringField,
    IntField,
    FloatField,
    ListField,
    DictField,
    DateTimeField,
    MapField,
)
from mongoengine import EmbeddedDocumentField


class Attribute(EmbeddedDocument):
    """A single name/value pair stored as an embedded document.

    Used in this module as the element type of the ``attributes`` lists on
    :class:`Sample` and :class:`BioSample`.

    Attributes
    ----------
    name: mongoengine.StringField
        Name (key) of the attribute.
    value: mongoengine.StringField
        Value of the attribute, stored as a string.

    """
    name = StringField()
    value = StringField()


class Organization(EmbeddedDocument):
    """Organization embedded document.

    An organization contains information about the group that submitted to
    SRA. For example, all data submitted to GEO are submitted to SRA using
    the GEO credentials.

    Attributes
    ----------
    organization_type: mongoengine.StringField
        Whether this organization is a center, an individual, or some other
        kind of group.
    abbreviation: mongoengine.StringField
        A short name for the organization.
    name: mongoengine.StringField
        Name of the organization.
    email: mongoengine.StringField
        Contact email address.
    first_name: mongoengine.StringField
        First name of the person who submitted the data.
    last_name: mongoengine.StringField
        Last name of the person who submitted the data.

    """
    organization_type = StringField()
    abbreviation = StringField()
    name = StringField()
    email = StringField()
    first_name = StringField()
    last_name = StringField()
class Study(EmbeddedDocument):
    """The contents of a SRA study.

    A study consists of a set of experiments designed with an overall goal in
    mind. For example, this could include a control experiment and a
    treatment experiment with the goal being to identify expression
    differences resulting from the treatment.

    The SRA study is the top level of the submission hierarchy.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a study. Identifiers begin with
        SRP/ERP/DRP depending on which database they originate from.
    bioproject: mongoengine.StringField
        The associated BioProject identifier.
    geo: mongoengine.StringField
        The associated GEO identifier.
    pubmed: mongoengine.ListField of mongoengine.IntField
        The associated Pubmed identifiers.
    title: mongoengine.StringField
        The title of the study.
    abstract: mongoengine.StringField
        Abstract of the study.
    center_name: mongoengine.StringField
        Name of the submitting center.
    center_project_name: mongoengine.StringField
        Center specific identifier for the study.
    description: mongoengine.StringField
        Additional text describing the study.

    """
    accn = StringField()

    # External IDs
    bioproject = StringField()
    geo = StringField()
    pubmed = ListField(IntField())

    # Attributes
    title = StringField()
    abstract = StringField()
    center_name = StringField()
    center_project_name = StringField()
    description = StringField()
class Sample(EmbeddedDocument):
    """The contents of a SRA sample.

    A sample is the biological unit. An individual sample or a pool of
    samples can be used in the SRA Experiment. This document contains
    information describing the sample, ranging from species information to
    detailed descriptions of what and how material was collected.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a sample. Identifiers begin with
        SRS/ERS/DRS depending on which database they originate from.
    biosample: mongoengine.StringField
        The associated BioSample identifier.
    geo: mongoengine.StringField
        The associated GEO identifier.
    title: mongoengine.StringField
        The title of the sample.
    taxon_id: mongoengine.IntField
        The NCBI taxon id.
    scientific_name: mongoengine.StringField
        The scientific name.
    common_name: mongoengine.StringField
        The common name.
    attributes: mongoengine.ListField of embedded Attribute documents
        A list of name/value pairs describing the sample, for example
        tissue:ovary or sex:female. Defaults to an empty list.

    """
    # SRS/DRS/ERS
    accn = StringField()

    # External IDs
    biosample = StringField()
    geo = StringField()

    # Attributes
    title = StringField()
    taxon_id = IntField()
    scientific_name = StringField()
    common_name = StringField()
    attributes = ListField(EmbeddedDocumentField(Attribute), default=list)
class Run(EmbeddedDocument):
    """Run Document.

    A Run describes a dataset generated from an Experiment. For example, if
    an Experiment is sequenced on multiple lanes of an Illumina flowcell then
    data from each lane are considered a Run.

    Attributes
    ----------
    srr: mongoengine.StringField
        The primary identifier for a run. Identifiers begin with SRR/ERR/DRR
        depending on which database they originate from.
    nspots: mongoengine.IntField
        The total number of spots on an Illumina flowcell.
    nbases: mongoengine.IntField
        The number of bases.
    nreads: mongoengine.IntField
        The number of reads.
    read_count_r1: mongoengine.FloatField
        Some Runs have additional information on reads. This is the number of
        reads from single ended data or the first read pair in pair ended
        data.
    read_len_r1: mongoengine.FloatField
        The average length of reads from single ended data or the first read
        pair in pair ended data.
    read_count_r2: mongoengine.FloatField
        The number of reads from the second read pair in pair ended data.
    read_len_r2: mongoengine.FloatField
        The average length of reads from the second read pair in pair ended
        data.
    release_date: mongoengine.DateTimeField
        Release date of the Run. This information is from the runinfo table
        and not the XML.
    load_date: mongoengine.DateTimeField
        Date the Run was uploaded. This information is from the runinfo table
        and not the XML.
    size_MB: mongoengine.IntField
        Size of the Run file. This information is from the runinfo table and
        not the XML.

    """
    # SRR/DRR/ERR
    srr = StringField()

    # Attributes
    nspots = IntField()
    nbases = IntField()
    nreads = IntField()

    # if single ended then just use _r1
    read_count_r1 = FloatField()
    read_len_r1 = FloatField()
    read_count_r2 = FloatField()
    read_len_r2 = FloatField()

    # NOTE: Additional Fields not in the SRA XML but in summary table
    release_date = DateTimeField()
    load_date = DateTimeField()
    size_MB = IntField()
class Geo(EmbeddedDocument):
    """Placeholder embedded document for GEO information.

    Attributes
    ----------
    accn: mongoengine.StringField
        Accession of the GEO record.
    GEO_Dataset: mongoengine.StringField
        GEO dataset value stored as a string.
    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp, filled in when an instance is created without an
        explicit value.

    """
    # TODO add geo parser
    accn = StringField()
    GEO_Dataset = StringField()
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)
class BioProject(EmbeddedDocument):
    """The contents of a BioProject.

    BioProject is another database housed at NCBI which records project
    metadata. This information should already be present in the SRA
    information, but to be safe we can pull in the BioProject for additional
    metadata.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a BioProject. These are the accession
        numbers which begin with PRJ.
    bioproject_id: mongoengine.IntField
        The primary identifier for a BioProject. These are the id numbers.
    name: mongoengine.StringField
        A brief name of the project.
    title: mongoengine.StringField
        The title of the project.
    description: mongoengine.StringField
        A short description of the project.
    last_update: mongoengine.DateTimeField
        Last date the BioProject was updated.
    submission_date: mongoengine.DateTimeField
        Date the BioProject was submitted.
    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp, filled in when an instance is created without an
        explicit value.

    """
    accn = StringField()
    bioproject_id = IntField()
    name = StringField()
    title = StringField()
    description = StringField()
    last_update = DateTimeField()
    submission_date = DateTimeField()
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)
class BioSample(EmbeddedDocument):
    """The contents of a BioSample.

    BioSample is another database housed at NCBI which records sample
    metadata. This information should already be present in the Sra.sample
    information, but to be safe we can pull in the BioSample for additional
    metadata.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a BioSample. These are the accession
        numbers which begin with SAM.
    biosample_id: mongoengine.IntField
        The primary identifier for a BioSample. These are the id numbers.
    title: mongoengine.StringField
        Title of the sample.
    description: mongoengine.StringField
        A free text description of the sample.
    last_update: mongoengine.StringField
        Last time BioSample updated sample information.
    submission_date: mongoengine.StringField
        Date the sample was submitted.
    contacts: mongoengine.ListField of mongoengine.DictField
        A list of dictionaries with contact information. Defaults to an
        empty list.
    attributes: mongoengine.ListField of embedded Attribute documents
        A list of name/value pairs describing the sample. Each entry is
        stored in the form {'name': ..., 'value': ...}. This was done to make
        querying easier. Defaults to an empty list.
    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp, filled in when an instance is created without an
        explicit value.

    """
    accn = StringField()
    biosample_id = IntField()
    title = StringField()
    description = StringField()
    last_update = StringField()
    submission_date = StringField()
    contacts = ListField(DictField(), default=list)
    attributes = ListField(EmbeddedDocumentField(Attribute), default=list)
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)
class Pubmed(EmbeddedDocument):
    """The contents of a Pubmed document.

    This document contains specific information about publications.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for Pubmed. These are the accession numbers
        which begin with PMID.
    title: mongoengine.StringField
        Title of the paper.
    abstract: mongoengine.StringField
        Paper abstract.
    authors: mongoengine.ListField of mongoengine.DictField
        List of authors, one dictionary per author.
    citation: mongoengine.StringField
        Citation information for the paper.
    date_created: mongoengine.DateTimeField
        Date the pubmed entry was created.
    date_completed: mongoengine.DateTimeField
        Date the pubmed entry was completed.
    date_revised: mongoengine.DateTimeField
        Date the pubmed entry was last updated.
    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp, filled in when an instance is created without an
        explicit value.

    """
    accn = StringField()
    title = StringField()
    abstract = StringField()
    authors = ListField(DictField())
    citation = StringField()
    date_created = DateTimeField()
    date_completed = DateTimeField()
    date_revised = DateTimeField()
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)
class SraDocument(Document):
    """Top-level MongoDB document that groups the SRA records for one entry.

    Scalar fields describe the entry and its sequencing library; the embedded
    documents attach the organization, study, sample, runs, and the linked
    BioProject, BioSample, Pubmed and GEO records.

    Attributes
    ----------
    srx: mongoengine.StringField
        Accession of the entry (presumably the SRX/ERX/DRX experiment
        accession -- confirm against the parser).
    sra_id: mongoengine.IntField
        Numeric SRA identifier.
    title: mongoengine.StringField
        Title of the entry.
    design: mongoengine.StringField
        Free text describing the design.
    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp, filled in when an instance is created without an
        explicit value.
    sra_create_date, sra_update_date: mongoengine.DateTimeField
        Creation and update dates.
    library_*, platform, instrument_model: mongoengine.StringField
        Technical attributes of the sequencing library and instrument.
    organization, study, sample: embedded documents
        The embedded Organization, Study and Sample.
    runs: mongoengine.ListField of embedded Run documents
        Runs belonging to this entry.
    BioProject, BioSmaple, Geo: embedded documents
        The embedded BioProject, BioSample and Geo records.
    papers: mongoengine.ListField of embedded Pubmed documents
        Publications linked to this entry.

    """
    srx = StringField()
    sra_id = IntField()
    title = StringField()
    design = StringField()
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)
    sra_create_date = DateTimeField()
    sra_update_date = DateTimeField()

    # Technical Attributes
    library_name = StringField()
    library_strategy = StringField()
    library_source = StringField()
    library_selection = StringField()
    library_layout = StringField()
    library_layout_length = StringField()
    library_layout_sdev = StringField()
    library_construction_protocol = StringField()
    platform = StringField()
    instrument_model = StringField()

    # Embedded Documents
    organization = EmbeddedDocumentField(Organization)
    study = EmbeddedDocumentField(Study)
    sample = EmbeddedDocumentField(Sample)
    runs = ListField(EmbeddedDocumentField(Run))
    BioProject = EmbeddedDocumentField(BioProject)
    # NOTE(review): "BioSmaple" looks like a typo of "BioSample". It is kept
    # as-is because renaming would change the stored field name and the
    # attribute that existing callers and queries use.
    BioSmaple = EmbeddedDocumentField(BioSample)
    papers = ListField(EmbeddedDocumentField(Pubmed))
    Geo = EmbeddedDocumentField(Geo)
class TaxRecord(EmbeddedDocument):
    """Counts for a single taxon within a taxonomic analysis.

    Attributes
    ----------
    parent: mongoengine.StringField
        Name of the parent level.
    total_count: mongoengine.IntField
        Number of mapped spots at this level and below.
    self_count: mongoengine.IntField
        Number of mapped spots at this level.
    tax_id: mongoengine.StringField
        Taxonomic identifier.
    name: mongoengine.StringField
        Name of this taxon.

    """
    parent = StringField()
    total_count = IntField()
    self_count = IntField()
    tax_id = StringField()
    name = StringField()


class TaxAnalysis(Document):
    """Results from a taxonomic analysis of a Run.

    Some Runs are analyzed and the number of reads that align to different
    taxa are recorded. The taxonomic analysis is stored in the SRA as a
    hierarchy, but it is stored here in a flattened form for easier access to
    different classes.

    Attributes
    ----------
    srr: mongoengine.StringField
        Identifier of the Run that was analyzed.
    nspot_analyze: mongoengine.IntField
        The number of spots analyzed.
    total_spots: mongoengine.IntField
        The total number of spots.
    mapped_spots: mongoengine.IntField
        The number of spots that were able to be mapped.
    tax_counts: mongoengine.MapField of lists of embedded TaxRecord documents
        The actual taxonomic counts, organized by level in the tree of life
        (for example 'kingdom', 'species', 'subspecies'). Each level maps to
        a list of TaxRecord entries holding the parent name, total and self
        counts, taxonomic identifier and name.

    """
    srr = StringField()
    nspot_analyze = IntField()
    total_spots = IntField()
    mapped_spots = IntField()
    tax_counts = MapField(ListField(EmbeddedDocumentField(TaxRecord), default=list))