Source code for sramongo.models

"""Document models for MongoDB"""
from datetime import datetime

from mongoengine import Document, EmbeddedDocument
from mongoengine import (
    StringField,
    IntField,
    FloatField,
    ListField,
    DictField,
    DateTimeField,
    MapField,
)
from mongoengine import EmbeddedDocumentField


class Attribute(EmbeddedDocument):
    """A single name/value pair stored as an embedded document.

    This is the element type of the ``attributes`` lists declared on
    :class:`Sample` and :class:`BioSample`.

    Attributes
    ----------
    name: mongoengine.StringField
        The attribute name.

    value: mongoengine.StringField
        The attribute value, stored as a string.

    """

    name = StringField()
    value = StringField()


class Organization(EmbeddedDocument):
    """Organization embedded document.

    An organization contains information about the group that submitted to
    SRA. For example, all data submitted to GEO are submitted to SRA using
    the GEO credentials.

    Attributes
    ----------
    organization_type: mongoengine.StringField
        Whether this organization is a center, an individual, or some other
        kind of group.

    abbreviation: mongoengine.StringField
        A short name for the organization.

    name: mongoengine.StringField
        Name of the organization.

    email: mongoengine.StringField
        Contact email address.

    first_name: mongoengine.StringField
        First name of the person who submitted the data.

    last_name: mongoengine.StringField
        Last name of the person who submitted the data.

    """

    organization_type = StringField()
    abbreviation = StringField()
    name = StringField()
    email = StringField()
    first_name = StringField()
    last_name = StringField()


class Study(EmbeddedDocument):
    """The contents of a SRA study.

    A study consists of a set of experiments designed with an overall goal in
    mind. For example, this could include a control experiment and a treatment
    experiment with the goal being to identify expression differences resulting
    from the treatment. The SRA study is the top level of the submission
    hierarchy.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a study. Identifiers begin with
        SRP/ERP/DRP depending on which database they originate from.

    bioproject: mongoengine.StringField
        The associated BioProject identifier.

    geo: mongoengine.StringField
        The associated GEO identifier.

    pubmed: mongoengine.ListField of mongoengine.IntField
        The associated Pubmed identifiers.

    title: mongoengine.StringField
        The title of the study.

    abstract: mongoengine.StringField
        Abstract of the study.

    center_name: mongoengine.StringField
        Name of the submitting center.

    center_project_name: mongoengine.StringField
        Center specific identifier for the study.

    description: mongoengine.StringField
        Additional text describing the study.

    """

    accn = StringField()

    # External IDs
    bioproject = StringField()
    geo = StringField()
    pubmed = ListField(IntField())

    # Attributes
    title = StringField()
    abstract = StringField()
    center_name = StringField()
    center_project_name = StringField()
    description = StringField()


class Sample(EmbeddedDocument):
    """The contents of a SRA sample.

    A sample is the biological unit. An individual sample or a pool of samples
    can be used in the SRA Experiment. This document contains information
    describing the sample ranging from species information to detailed
    descriptions of what and how material was collected.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a sample. Identifiers begin with
        SRS/ERS/DRS depending on which database they originate from.

    biosample: mongoengine.StringField
        The associated BioSample identifier.

    geo: mongoengine.StringField
        The associated GEO identifier.

    title: mongoengine.StringField
        The title of the sample.

    taxon_id: mongoengine.IntField
        The NCBI taxon id.

    scientific_name: mongoengine.StringField
        The scientific name.

    common_name: mongoengine.StringField
        The common name.

    attributes: mongoengine.ListField of embedded Attribute documents
        A list of name/value pairs describing the sample. For example
        tissue:ovary or sex:female. Defaults to an empty list.

    """

    # SRS/DRS/ERS
    accn = StringField()

    # External IDs
    biosample = StringField()
    geo = StringField()

    # Attributes
    title = StringField()
    taxon_id = IntField()
    scientific_name = StringField()
    common_name = StringField()
    attributes = ListField(EmbeddedDocumentField(Attribute), default=list)


class Run(EmbeddedDocument):
    """Run Document.

    A Run describes a dataset generated from an Experiment. For example if an
    Experiment is sequenced on multiple lanes of an Illumina flowcell then data
    from each lane are considered a Run.

    Attributes
    ----------
    srr: mongoengine.StringField
        The primary identifier for a run. Identifiers begin with
        SRR/ERR/DRR depending on which database they originate from.

    nspots: mongoengine.IntField
        The total number of spots on an Illumina flowcell.

    nbases: mongoengine.IntField
        The number of bases.

    nreads: mongoengine.IntField
        The number of reads.

    read_count_r1: mongoengine.FloatField
        Some Runs have additional information on reads. This is the number of
        reads from single ended or the first read pair in pair ended data.

    read_len_r1: mongoengine.FloatField
        This is the average length of reads from single ended or the first read
        pair in pair ended data.

    read_count_r2: mongoengine.FloatField
        This is the number of reads from the second read pair in pair ended
        data.

    read_len_r2: mongoengine.FloatField
        This is the average length of reads from the second read pair in pair
        ended data.

    release_date: mongoengine.DateTimeField
        Release date of the Run. This information is from the runinfo table and
        not the XML.

    load_date: mongoengine.DateTimeField
        Date the Run was uploaded. This information is from the runinfo table
        and not the XML.

    size_MB: mongoengine.IntField
        Size of the Run file. This information is from the runinfo table and not
        the XML.

    """

    # SRR/DRR/ERR
    srr = StringField()

    # Attributes
    nspots = IntField()
    nbases = IntField()
    nreads = IntField()

    # if single ended then just use _r1
    read_count_r1 = FloatField()
    read_len_r1 = FloatField()

    read_count_r2 = FloatField()
    read_len_r2 = FloatField()

    # NOTE: Additional Fields not in the SRA XML but in summary table
    release_date = DateTimeField()
    load_date = DateTimeField()
    size_MB = IntField()


class Geo(EmbeddedDocument):
    """Placeholder embedded document for GEO metadata.

    No GEO parser exists yet (see the TODO below), so only a minimal set of
    fields is declared.

    Attributes
    ----------
    accn: mongoengine.StringField
        Accession identifier (presumably the GEO accession; confirm once the
        parser is written).

    GEO_Dataset: mongoengine.StringField
        Free-form string; its exact content is not defined by this model.

    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp assigned when a document is created without an
        explicit value.

    """

    # TODO add geo parser
    accn = StringField()
    GEO_Dataset = StringField()
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # single stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)


class BioProject(EmbeddedDocument):
    """The contents of a BioProject.

    BioProject is another database housed at NCBI which records project
    metadata.  This information should already be present in the SRA
    information, but to be safe we can pull into the BioProject for additional
    metadata.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a BioProject. These are the accession number
        which begin with PRJ.

    bioproject_id: mongoengine.IntField
        The primary identifier for a BioProject. These are the id numbers.

    name: mongoengine.StringField
        A brief name of the project.

    title: mongoengine.StringField
        The title of the project.

    description: mongoengine.StringField
        A short description of the project.

    last_update: mongoengine.DateTimeField
        Last date the BioProject was updated.

    submission_date: mongoengine.DateTimeField
        Date the BioProject was submitted.

    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp assigned when a document is created without an
        explicit value.

    """

    accn = StringField()
    bioproject_id = IntField()
    name = StringField()
    title = StringField()
    description = StringField()
    last_update = DateTimeField()
    submission_date = DateTimeField()

    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # single stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)


class BioSample(EmbeddedDocument):
    """The contents of a BioSample.

    BioSample is another database housed at NCBI which records sample metadata.
    This information should already be present in the Sra.sample information,
    but to be safe we can pull into the BioSample for additional metadata.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for a BioSample. These are the accession number
        which begin with SAM.

    biosample_id: mongoengine.IntField
        The primary identifier for a BioSample. These are the id number.

    title: mongoengine.StringField
        A free text description of the sample.

    description: mongoengine.StringField
        A free text description of the sample.

    last_update: mongoengine.StringField
        Last time BioSample updated sample information.

    submission_date: mongoengine.StringField
        Date the sample was submitted.

    contacts: mongoengine.ListField of mongoengine.DictField
        A list of free-form dictionaries (presumably submitter contact
        details; the keys are not constrained by this model). Defaults to an
        empty list.

    attributes: mongoengine.ListField of embedded Attribute documents
        A list of name/value pairs describing the sample. Each entry has a
        ``name`` and a ``value`` field; this layout was chosen to make
        querying easier. Defaults to an empty list.

    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp assigned when a document is created without an
        explicit value.

    Notes
    -----
    Earlier documentation listed a ``publication_date`` attribute, but no
    such field is declared on this model.

    """

    accn = StringField()
    biosample_id = IntField()
    title = StringField()
    description = StringField()
    last_update = StringField()
    submission_date = StringField()
    contacts = ListField(DictField(), default=list)
    attributes = ListField(EmbeddedDocumentField(Attribute), default=list)
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # single stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)


class Pubmed(EmbeddedDocument):
    """The contents of a Pubmed document.

    This document contains specific information about publications.

    Attributes
    ----------
    accn: mongoengine.StringField
        The primary identifier for Pubmed. These are the accession number
        which begin with PMID.

    title: mongoengine.StringField
        Title of the paper.

    abstract: mongoengine.StringField
        Paper abstract.

    authors: mongoengine.ListField of mongoengine.DictField
        List of authors, one dictionary per author.

    citation: mongoengine.StringField
        Citation information for the paper.

    date_created: mongoengine.DateTimeField
        Date the pubmed entry was created.

    date_completed: mongoengine.DateTimeField
        Date the pubmed entry was completed.

    date_revised: mongoengine.DateTimeField
        Date the pubmed entry was last updated.

    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp assigned when a document is created without an
        explicit value.

    """

    accn = StringField()
    title = StringField()
    abstract = StringField()
    authors = ListField(DictField())
    citation = StringField()
    date_created = DateTimeField()
    date_completed = DateTimeField()
    date_revised = DateTimeField()
    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # single stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)


class SraDocument(Document):
    """Top-level MongoDB document for one SRA record.

    Besides its own scalar fields, the document embeds the organization,
    study, sample, runs, BioProject, BioSample, Pubmed and GEO sub-documents
    declared in this module.

    Attributes
    ----------
    srx: mongoengine.StringField
        Accession of the record (presumably the SRA experiment accession,
        SRX/ERX/DRX; confirm against the parser).

    sra_id: mongoengine.IntField
        Numeric identifier of the record.

    title: mongoengine.StringField
        Title of the record.

    design: mongoengine.StringField
        Free text describing the design.

    sramongo_last_updated: mongoengine.DateTimeField
        Naive UTC timestamp assigned when a document is created without an
        explicit value.

    sra_create_date, sra_update_date: mongoengine.DateTimeField
        Creation and update dates of the record.

    library_name, library_strategy, library_source, library_selection,
    library_layout, library_layout_length, library_layout_sdev,
    library_construction_protocol, platform, instrument_model:
    mongoengine.StringField
        Technical attributes, all stored as strings.

    organization: embedded Organization
    study: embedded Study
    sample: embedded Sample
    runs: mongoengine.ListField of embedded Run
    BioProject: embedded BioProject
    BioSmaple: embedded BioSample
        The field name is misspelled; see the comment on the field.
    papers: mongoengine.ListField of embedded Pubmed
    Geo: embedded Geo

    """

    srx = StringField()
    sra_id = IntField()
    title = StringField()
    design = StringField()

    # Pass the callable, not its result: ``datetime.utcnow()`` would be
    # evaluated once at import time and every document would share that
    # single stale timestamp.
    sramongo_last_updated = DateTimeField(default=datetime.utcnow)

    sra_create_date = DateTimeField()
    sra_update_date = DateTimeField()

    # Technical Attributes
    library_name = StringField()
    library_strategy = StringField()
    library_source = StringField()
    library_selection = StringField()
    library_layout = StringField()
    library_layout_length = StringField()
    library_layout_sdev = StringField()
    library_construction_protocol = StringField()
    platform = StringField()
    instrument_model = StringField()

    # Embedded Documents
    organization = EmbeddedDocumentField(Organization)
    study = EmbeddedDocumentField(Study)
    sample = EmbeddedDocumentField(Sample)
    runs = ListField(EmbeddedDocumentField(Run))
    # The three fields below reuse module-level class names. The right-hand
    # side is evaluated before the name is rebound, so it still sees the
    # module-level class; after the assignment the name refers to the field
    # for the rest of this class body.
    BioProject = EmbeddedDocumentField(BioProject)
    # NOTE(review): "BioSmaple" is a misspelling of "BioSample". It is kept
    # as-is because the attribute name is also the stored field name, and
    # renaming it would break existing callers and stored documents.
    BioSmaple = EmbeddedDocumentField(BioSample)
    papers = ListField(EmbeddedDocumentField(Pubmed))
    Geo = EmbeddedDocumentField(Geo)


class TaxRecord(EmbeddedDocument):
    """One taxon entry of a taxonomic analysis.

    Instances are the list elements stored in ``TaxAnalysis.tax_counts``.

    Attributes
    ----------
    parent: mongoengine.StringField
        Name of the parent level.

    total_count: mongoengine.IntField
        Number of mapped spots at this level and below.

    self_count: mongoengine.IntField
        Number of mapped spots at this level.

    tax_id: mongoengine.StringField
        Taxonomic identifier, stored as a string.

    name: mongoengine.StringField
        Name of this taxon.

    """

    parent = StringField()
    total_count = IntField()
    self_count = IntField()
    tax_id = StringField()
    name = StringField()


class TaxAnalysis(Document):
    """Results from a taxonomic analysis of a Run.

    Some Runs are analyzed and the number of reads that align to different
    taxa are recorded. The taxonomic analysis is stored in the SRA as a
    hierarchy, but it is stored here flattened by level for easier access to
    different classes.

    Attributes
    ----------
    srr: mongoengine.StringField
        Identifier of the Run the analysis belongs to.

    nspot_analyze: mongoengine.IntField
        The number of spots analyzed.

    total_spots: mongoengine.IntField
        The total number of spots.

    mapped_spots: mongoengine.IntField
        The number of spots that were able to be mapped.

    tax_counts: mongoengine.MapField of lists of embedded TaxRecord
        The actual taxonomic counts, organized by level in the tree of life.
        Each key is a level name ('kingdom', ..., 'species', 'subspecies',
        ...) and each value is a list of records with these fields:

        parent
            Name of parent level.
        total_count
            Number of mapped spots at this level and below.
        self_count
            Number of mapped spots at this level.
        tax_id
            Taxonomic identifier.
        name
            Name of this taxonomy.

    """

    srr = StringField()
    nspot_analyze = IntField()
    total_spots = IntField()
    mapped_spots = IntField()
    tax_counts = MapField(ListField(EmbeddedDocumentField(TaxRecord), default=list))