Examples » RMSD/RMSF

import os
from pyxmolpp2 import calc_rmsd, calc_alignment, PdbFile, aName, Atom
import numpy as np

Let's create a frame to work with

pdb_filename = os.path.join(os.environ["TEST_DATA_PATH"], "pdb/rcsb/5BMG.pdb")
pdb_file = PdbFile(pdb_filename)
frame = pdb_file.frames()[0]

Number of residues in molecules must be same (strip water, ions, etc.)

N = 0
for mol in frame.molecules:
    if mol.size != frame.molecules[0].size:
        break
    N += 1

Print RMSD matrix for all deposited chains:

print(' '.join([" " * 6] + [f"{i:5d}" for i in range(N)]))
for i in range(0, N):
    chain_i_ca = frame.molecules[i].atoms.filter(aName == "CA")
    print(f"{i:-5d}:", end=" ")
    for j in range(0, i + 1):
        chain_j_ca = frame.molecules[j].atoms.filter(aName == "CA")

        alignment = chain_i_ca.alignment_to(chain_j_ca)
        crd = chain_i_ca.coords.values.copy()
        crd = (crd @ alignment.matrix3d().T) + alignment.vector3d().values

        rmsd = calc_rmsd(chain_j_ca.coords.values, crd)

        print(f"{rmsd:5.1f}", end=" ")
    print("")
           0     1     2     3     4     5     6     7
    0:   0.0 
    1:   0.4   0.0 
    2:   0.4   0.4   0.0 
    3:   0.4   0.4   0.3   0.0 
    4:   0.4   0.3   0.3   0.3   0.0 
    5:   0.3   0.4   0.3   0.4   0.4   0.0 
    6:   0.3   0.4   0.3   0.4   0.3   0.4   0.0 
    7:   0.4   0.4   0.3   0.3   0.3   0.3   0.4   0.0

Calculate RMSF per residue

first_mol_ca = frame.molecules[0].atoms.filter(aName == "CA")

# initialize average coordinates with (0,0,0)
avg_coords = np.zeros((first_mol_ca.size, 3))

# calculate average coordinates across N frames
for mol in frame.molecules[:N]:
    chain_i_ca = mol.atoms.filter(aName == "CA")
    chain_i_ca.coords.apply(chain_i_ca.alignment_to(first_mol_ca))

    avg_coords += chain_i_ca.coords.values

avg_coords /= N

# align to average coordinates
for mol in frame.molecules[:N]:
    chain_i_ca = mol.atoms.filter(aName == "CA")
    chain_i_ca.coords.apply(calc_alignment(ref=avg_coords, var=chain_i_ca.coords.values))

# calculate per residue RMSF

import numpy as np

rmsf = np.zeros((first_mol_ca.size,))
for mol in frame.molecules[:N]:
    chain_i_ca = mol.atoms.filter(aName == "CA")
    for k, a in enumerate(chain_i_ca):
        rmsf[k] += np.linalg.norm(a.r.values - avg_coords[k])

rmsf = np.sqrt(rmsf / N)

Now we can plot RMSF

import matplotlib.pyplot as plt
from matplotlib.ticker import Formatter, IndexLocator

plt.style.use(MCSS_MPL_DARK)

plt.figure(figsize=(5, 3))
plt.step(range(len(rmsf)), rmsf, where="mid")
plt.fill_between(range(len(rmsf)), rmsf, step="mid", alpha=0.1)
plt.ylabel("RMSF, $\AA$")
plt.grid(color="#CCCCCC", lw=0.1)


class ResidueFormatter(Formatter):
    def __call__(self, x, pos=None):
        from Bio.PDB.Polypeptide import three_to_one
        if x < 0 or x >= first_mol_ca.size:
            return ''
        a = first_mol_ca[int(x)]
        if a.residue.id.serial % 5 == 0:
            return "%s\n%d" % (three_to_one(a.residue.name), a.residue.id.serial)
        else:
            return "%s" % (three_to_one(a.residue.name))

plt.gca().xaxis.set_minor_locator(IndexLocator(base=1, offset=-1))
plt.gca().xaxis.set_major_locator(IndexLocator(base=5, offset=-1))
plt.gca().xaxis.set_minor_formatter(ResidueFormatter())
plt.gca().xaxis.set_major_formatter(ResidueFormatter())
plt.gca().tick_params(axis='both', which='both', labelsize=6)
plt.gca().tick_params(axis='both', which='minor', pad=4, length=2)
plt.gca().tick_params(axis='both', which='major', pad=2, length=4)