食谱

使用 Biopython 读取大型 PDB 文件。

如果使用 Biopython 处理由分子动力学 (MD) 代码生成的 **PDB** 文件，则在读取时会很快遇到原子丢失的问题。典型的错误消息指出会丢失原子

WARNING: Residue (' ', 1, ' ') redefined at line 31.
PDBConstructionException: Blank altlocs in duplicate residue SOL (' ', 1, ' ') at line 31.
Exception ignored.
Some atoms or residues will be missing in the data structure.

问题很简单，这些文件可能很大，包含数十万个原子和残基（例如，每个水分子都是一个独立的残基），而 PDB 格式在 ATOM 或 HETATM 记录的相应列中没有足够的空间来容纳原子编号（serial）>99,999 和残基编号（resSeq）> 9999。因此，这些数字只是以 100,000（serial）或 10,000（resSeq）为模写入。这在链中创建了重复条目，并且 Bio.PDB.PDBParser（更确切地说，是 StructureBuilder）会抱怨。其影响是，并非所有原子都被读取。

以下代码从 Bio.PDB.StructureBuilder.StructureBuilder 派生了一个新类，该类只在必要时增加 resSeq。由于它不是非常小心，所以被称为 SloppyStructureBuilder。

还有一个名为 SloppyPDBIO 的新类，它使用 serial 和 resSeq 包装写入 pdb 文件，以便生成的 pdb 文件符合合法 PDB 格式。

示例

加载一个大型 pdb 文件，并再次将其写入。如何对 pdb 文件执行有趣的操作，例如删除围绕配体的某些水分子，将在另一篇文章中介绍。

from Bio.PDB import PDBParser
import xpdb  # this is the module described below

# read
sloppyparser = PDBParser(
    PERMISSIVE=True, structure_builder=xpdb.SloppyStructureBuilder()
)
structure = sloppyparser.get_structure("MD_system", "my_big_fat.pdb")

# ... do something here ...

# write
sloppyio = xpdb.SloppyPDBIO()
sloppyio.set_structure(structure)
sloppyio.save("new_big_fat.pdb")

类

这是 Python 实现。将其存储为一个名为 xpdb.py 的模块，该模块位于 PYTHONPATH 的某个位置。

# xpdb.py -- extensions to Bio.PDB
# (c) 2009 Oliver Beckstein
# Relased under the same license as Biopython.
# See https://biopython.pythonlang.cn/wiki/Reading_large_PDB_files

import sys
import Bio.PDB
import Bio.PDB.StructureBuilder
from Bio.PDB.Residue import Residue


class SloppyStructureBuilder(Bio.PDB.StructureBuilder.StructureBuilder):
    """Cope with resSeq < 10,000 limitation by just incrementing internally.

    # Q: What's wrong here??
    #   Some atoms or residues will be missing in the data structure.
    #   WARNING: Residue (' ', 8954, ' ') redefined at line 74803.
    #   PDBConstructionException: Blank altlocs in duplicate residue SOL
    #   (' ', 8954, ' ') at line 74803.
    #
    # A: resSeq only goes to 9999 --> goes back to 0 (PDB format is not really
    #    good here)
    """

    # NOTE/TODO:
    # - H and W records are probably not handled yet (don't have examples
    #   to test)

    def __init__(self, verbose=False):
        Bio.PDB.StructureBuilder.StructureBuilder.__init__(self)
        self.max_resseq = -1
        self.verbose = verbose

    def init_residue(self, resname, field, resseq, icode):
        """Initiate a new Residue object.

        Arguments:
        o resname - string, e.g. "ASN"
        o field - hetero flag, "W" for waters, "H" for
            hetero residues, otherwise blanc.
        o resseq - int, sequence identifier
        o icode - string, insertion code

        """
        if field != " ":
            if field == "H":
                # The hetero field consists of
                # H_ + the residue name (e.g. H_FUC)
                field = "H_" + resname
        res_id = (field, resseq, icode)

        if resseq > self.max_resseq:
            self.max_resseq = resseq

        if field == " ":
            fudged_resseq = False
            while self.chain.has_id(res_id) or resseq == 0:
                # There already is a residue with the id (field, resseq, icode)
                # resseq == 0 catches already wrapped residue numbers which
                # do not trigger the has_id() test.
                #
                # Be sloppy and just increment...
                # (This code will not leave gaps in resids... I think)
                #
                # XXX: shouldn't we also do this for hetero atoms and water??
                self.max_resseq += 1
                resseq = self.max_resseq
                res_id = (field, resseq, icode)  # use max_resseq!
                fudged_resseq = True

            if fudged_resseq and self.verbose:
                sys.stderr.write(
                    "Residues are wrapping (Residue "
                    + "('%s', %i, '%s') at line %i)."
                    % (field, resseq, icode, self.line_counter)
                    + ".... assigning new resid %d.\n" % self.max_resseq
                )
        residue = Residue(res_id, resname, self.segid)
        self.chain.add(residue)
        self.residue = residue


class SloppyPDBIO(Bio.PDB.PDBIO):
    """PDBIO class that can deal with large pdb files as used in MD simulations

    - resSeq simply wrap and are printed modulo 10,000.
    - atom numbers wrap at 99,999 and are printed modulo 100,000

    """

    # The format string is derived from the PDB format as used in PDBIO.py
    # (has to be copied to the class because of the package layout it is not
    # externally accessible)
    _ATOM_FORMAT_STRING = (
        "%s%5i %-4s%c%3s %c%4i%c   " + "%8.3f%8.3f%8.3f%6.2f%6.2f      %4s%2s%2s\n"
    )

    def _get_atom_line(
        self,
        atom,
        hetfield,
        segid,
        atom_number,
        resname,
        resseq,
        icode,
        chain_id,
        element="  ",
        charge="  ",
    ):
        """Returns an ATOM string that is guaranteed to fit the ATOM format.

        - Resid (resseq) is wrapped (modulo 10,000) to fit into %4i (4I) format
        - Atom number (atom_number) is wrapped (modulo 100,000) to fit into
          %5i (5I) format

        """
        if hetfield != " ":
            record_type = "HETATM"
        else:
            record_type = "ATOM  "
        name = atom.get_fullname()
        altloc = atom.get_altloc()
        x, y, z = atom.get_coord()
        bfactor = atom.get_bfactor()
        occupancy = atom.get_occupancy()
        args = (
            record_type,
            atom_number % 100000,
            name,
            altloc,
            resname,
            chain_id,
            resseq % 10000,
            icode,
            x,
            y,
            z,
            occupancy,
            bfactor,
            segid,
            element,
            charge,
        )
        return self._ATOM_FORMAT_STRING % args


# convenience functions

sloppyparser = Bio.PDB.PDBParser(
    PERMISSIVE=True, structure_builder=SloppyStructureBuilder()
)


def get_structure(pdbfile, pdbid="system"):
    return sloppyparser.get_structure(pdbid, pdbfile)

另请参阅

SloppyStructureBuilder() 被用作一个小型 Python 模块 edPDB 的基础，用于编辑 PDB 文件以准备 MD 模拟。