Source code for MDAnalysis.topology.MMCIFParser

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
"""
MMCIF Topology Parser
====================

Read topology information from mmCIF/PDBx coordinate files using the `Gemmi library <https://github.com/project-gemmi/gemmi>`_

mmCIF files contain topology information about the molecules in the structure. For each atom the following attributes are read
and stored in the relevant topology attributes:
    - :class:`..core.topologyattrs.AtomAttr` subclasses:
        - :class:`MDAnalysis.core.topologyattrs.AltLocs`
        - :class:`MDAnalysis.core.topologyattrs.Atomids`
        - :class:`MDAnalysis.core.topologyattrs.Atomnames`
        - :class:`MDAnalysis.core.topologyattrs.Atomtypes`
        - :class:`MDAnalysis.core.topologyattrs.ChainIDs`
        - :class:`MDAnalysis.core.topologyattrs.Elements`
        - :class:`MDAnalysis.core.topologyattrs.FormalCharges`
        - :class:`MDAnalysis.core.topologyattrs.Masses`
        - :class:`MDAnalysis.core.topologyattrs.Occupancies`
        - :class:`MDAnalysis.core.topologyattrs.RecordTypes`
        - :class:`MDAnalysis.core.topologyattrs.Tempfactors`
    - :class:`..core.topologyattrs.ResidueAttr` subclasses:
        - :class:`MDAnalysis.core.topologyattrs.Resnums`
        - :class:`MDAnalysis.core.topologyattrs.ICodes`
        - :class:`MDAnalysis.core.topologyattrs.Resids`
        - :class:`MDAnalysis.core.topologyattrs.Resnames`
    - :class:`..core.topologyattrs.SegmentAttr` subclasses:
        - :class:`MDAnalysis.core.topologyattrs.Segids`

Classes
-------

.. autoclass:: MMCIFParser
   :members:
   :inherited-members:

.. versionadded:: 2.9.0
"""

try:
    import gemmi
except ImportError:
    HAS_GEMMI = False
else:
    HAS_GEMMI = True

import warnings

import numpy as np

from ..core.topology import Topology
from ..core.topologyattrs import (
    AltLocs,
    Atomids,
    Atomnames,
    Atomtypes,
    ChainIDs,
    Elements,
    FormalCharges,
    ICodes,
    Masses,
    Occupancies,
    RecordTypes,
    Resids,
    Resnames,
    Resnums,
    Segids,
    Tempfactors,
)
from .base import TopologyReaderBase, change_squash


[docs]class MMCIFParser(TopologyReaderBase): """Parser that obtains a list of atoms from a standard MMCIF/PDBx file using ``gemmi`` library (https://github.com/project-gemmi/gemmi). Creates the following Attributes (if present): - :class:`..core.topologyattrs.AtomAttr` subclasses: - :class:`..core.topologyattrs.AltLocs` - :class:`..core.topologyattrs.Atomids` - :class:`..core.topologyattrs.Atomnames` - :class:`..core.topologyattrs.Atomtypes` - :class:`..core.topologyattrs.ChainIDs` - :class:`..core.topologyattrs.Elements` - :class:`..core.topologyattrs.FormalCharges` - :class:`..core.topologyattrs.Masses` - :class:`..core.topologyattrs.Occupancies` - :class:`..core.topologyattrs.RecordTypes` - :class:`..core.topologyattrs.Tempfactors` - :class:`..core.topologyattrs.ResidueAttr` subclasses: - :class:`..core.topologyattrs.Resnums` - :class:`..core.topologyattrs.ICodes` - :class:`..core.topologyattrs.Resids` - :class:`..core.topologyattrs.Resnames` - :class:`..core.topologyattrs.SegmentAttr` subclasses: - :class:`..core.topologyattrs.Segids` .. versionadded:: 2.9.0 """ format = ["cif", "cif.gz", "mmcif", "mmcif.gz", "pdb_gemmi"]
[docs] def parse(self, **kwargs) -> Topology: """Read the file and return the structure. Returns ------- MDAnalysis Topology object """ structure = gemmi.read_structure(self.filename) if len(structure) > 1: warnings.warn( f"MMCIF model {self.filename} contains {len(structure)=} different models, " "but only the first one will be used to assign the topology" ) model = structure[0] ( altlocs, # at.altloc serials, # at.serial names, # at.name atomtypes, # at.name # ------------------ chainids, # chain.name elements, # at.element.name formalcharges, # at.charge weights, # at.element.weight # ------------------ occupancies, # at.occ record_types, # res.het_flag tempfactors, # at.b_iso # ------------------ icodes, # residue.seqid.icode resids, # residue.seqid.num resnames, # residue.name ) = map( # this construct takes np.ndarray of all lists of attributes, extracted from the `gemmi.Model` np.array, list( zip( *[ ( # tuple of attributes # extracted from residue, atom or chain in the structure # ------------------ atom.altloc, # altlocs atom.serial, # serials atom.name, # names atom.name, # atomtypes # ------------------ chain.name, # chainids atom.element.name, # elements atom.charge, # formalcharges atom.element.weight, # weights # ------------------ atom.occ, # occupancies residue.het_flag, # record_types atom.b_iso, # tempfactors # ------------------ residue.seqid.icode, # icodes residue.seqid.num, # resids residue.name, # resnames ) # the main loop over the `gemmi.Model` object for chain in model for residue in chain for atom in residue ] ) ), ) # fill in altlocs, since gemmi has '' as default altlocs = ["A" if not elem else elem for elem in altlocs] # convert default gemmi record types to default MDAnalysis record types record_types = [ "ATOM" if record == "A" else "HETATM" if record == "H" else None for record in record_types ] if any((elem is None for elem in record_types)): raise ValueError("Found an atom that is neither ATOM or HETATM") # Atom Attr's attrs = [ AltLocs(altlocs), Atomids(serials), Atomnames(names), Atomtypes(atomtypes), # ---------------------------- ChainIDs(chainids), Elements(elements), FormalCharges(formalcharges), Masses(weights), # ---------------------------- Occupancies(occupancies), RecordTypes(record_types), Tempfactors(tempfactors), ] n_atoms = len(altlocs) # Residue Attr's residx, (resids, resnames, icodes, chainids) = change_squash( (resids, resnames, icodes, chainids), (resids, resnames, icodes, chainids), ) attrs.append(Resids(resids)) attrs.append(Resnames(resnames)) attrs.append(Resnums(resids.copy())) attrs.append(ICodes([icode.strip() for icode in icodes])) n_residues = len(resids) # Segment Attr's segidx, (segids,) = change_squash((chainids,), (chainids,)) attrs.append(Segids(segids)) n_segments = len(segids) return Topology( n_atoms, n_residues, n_segments, attrs=attrs, atom_resindex=residx, residue_segindex=segidx, )