from __future__ import print_function
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline

import pytraj as pt

import scipy
import scipy.cluster.hierarchy

# load data
traj = pt.iterload('tz2.nc', 'tz2.parm7')
traj

pytraj.TrajectoryIterator, 101 frames: 
Size: 0.000503 (GB)
<Topology: 223 atoms, 13 residues, 1 mols, non-PBC>

# calculate pairwise rmsd with `autoimage=True`
# generally we only need to cluster a subset of atoms.
# cluster for 'CA' atoms

distances = pt.pairwise_rmsd(traj(autoimage=True), mask='@CA')

# use `scipy` to perform clustering
linkage = scipy.cluster.hierarchy.ward(distances)

scipy.cluster.hierarchy.dendrogram(linkage, no_labels=True, count_sort='descendent')
None

# cluster for all atoms but H

distances = pt.pairwise_rmsd(traj(autoimage=True), mask='!@H=')
linkage = scipy.cluster.hierarchy.ward(distances)
scipy.cluster.hierarchy.dendrogram(linkage, no_labels=True, count_sort='descendent')
None

PCA analysis with sklearn (adapted from `mdtraj` and cpptraj tutorial)¶

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline
from __future__ import print_function
%config InlineBackend.figure_format = 'retina'  # high resolution
import matplotlib

import pytraj as pt
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from pytraj.plot import show_config

# we use `load` method to load all data to memory. This is good for small data size.
# use `pytraj.iterload` for out-of-core traj.

traj = pt.load('tz2.nc', 'tz2.parm7')
traj

pytraj.Trajectory, 101 frames: 
Size: 0.000503 (GB)
<Topology: 223 atoms, 13 residues, 1 mols, non-PBC>

pca = PCA(n_components=2)
pca

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

# superpose to 1st frame
pt.superpose(traj, ref=0, mask='!@H=')

pytraj.Trajectory, 101 frames: 
Size: 0.000503 (GB)
<Topology: 223 atoms, 13 residues, 1 mols, non-PBC>

# create average structure

avg = pt.mean_structure(traj)
avg

<Frame with 223 atoms>

# superpose all structures to average frame
pt.superpose(traj, ref=avg, mask='!@H=')

pytraj.Trajectory, 101 frames: 
Size: 0.000503 (GB)
<Topology: 223 atoms, 13 residues, 1 mols, non-PBC>

# perform PCA calculation and get transformed coords
# we need to reshape 3D traj.xyz array to 2D to make sklearn happy
# make a new traj by stripping all H atoms
traj_new = traj['!@H=']
xyz_2d = traj_new.xyz.reshape(traj_new.n_frames, traj_new.n_atoms * 3)
print(xyz_2d.shape) # (n_frames, n_dimensions)

(101, 351)

reduced_cartesian = pca.fit_transform(xyz_2d)
print(reduced_cartesian.shape) # (n_frames, n_dimensions)

(101, 2)

# ignore warning
import warnings
warnings.filterwarnings('ignore')

plt.figure()
plt.scatter(reduced_cartesian[:, 0], reduced_cartesian[:,1], marker='o', c=range(traj_new.n_frames), alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
cbar = plt.colorbar()
cbar.set_label('frame #')

Compare to cpptraj data¶

note: stop here if you do not care (a bit compilicated code)

# cpptraj

# copy from Amber15 manual (page 619)
command = '''
# Step one. Generate average structure.
# RMS-Fit to first frame to remove global translation/rotation.
parm tz2.parm7
trajin tz2.nc
rms first !@H=
average crdset AVG
run
# Step two. RMS-Fit to average structure. Calculate covariance matrix.
# Save the fit coordinates.
rms ref AVG !@H=
matrix covar name MyMatrix !@H=
createcrd CRD1
run
# Step three. Diagonalize matrix.
runanalysis diagmatrix MyMatrix vecs 2 name MyEvecs
# Step four. Project saved fit coordinates along eigenvectors 1 and 2
crdaction CRD1 projection evecs MyEvecs !@H= out project.dat beg 1 end 2
'''

state = pt.datafiles.load_cpptraj_state

state = pt.datafiles.load_cpptraj_state(command)
# tell 'run' to perform all calculation
state.run()

CpptrajState, include:
<datasetlist: 9 datasets>

# get data
state.data

<pytraj.datasets.CpptrajDatasetList - 9 datasets>

print([dset.key for dset in state.data])
print(state.data['MyMatrix'].values.shape)

['tz2.parm7', 'RMSD_00001', 'AVG', 'RMSD_00003', 'MyMatrix', 'CRD1', 'MyEvecs', 'Mode1', 'Mode2']
(351, 351)

# reduced_cartesian corresponds to dataset with names of 'Mode1', 'Mode2'
mode_0, mode_1 = -state.data['Mode1'].values, -state.data['Mode2'].values
# mode_0, mode_1 = state.data['Mode1'].values, state.data['Mode2'].values

# plot: cpptraj
fig = plt.figure()
ax_0 = fig.add_subplot(211)
ax_0.scatter(mode_0, mode_1, marker='o', c=range(traj.n_frames), alpha=0.5)
ax_0.set_xlabel('PC1')
ax_0.set_ylabel('PC2')
ax_0.set_xlim([-60, 80])
ax_0.set_ylim([-40, 40])
ax_0.set_title('cpptraj-flip')
ax_0.set_yticks([-40, -20, 0, 20, 40])

# plot: sklearn
fig = plt.figure()
ax_1 = fig.add_subplot(212)
ax_1.scatter(reduced_cartesian[:, 0], reduced_cartesian[:,1], marker='o', c=range(traj.n_frames), alpha=0.5)
ax_1.set_xlabel('PC1')
ax_1.set_ylabel('PC2')
ax_1.set_yticks([-40, -20, 0, 20, 40])
ax_1.set_title('sklearn')

<matplotlib.text.Text at 0x10f0f5128>

print('sklearn \n')
print(reduced_cartesian[:, 0], reduced_cartesian[:, 1])

print('\ncpptraj\n') # flip sign, why?
print(mode_0, mode_1)

sklearn 

[ -4.93425114 -13.80002313 -20.61605771 ...,  57.92280397  61.25728693
  52.85142033] [  4.033336    -6.91324506 -14.53991347 ...,  -6.75793613   2.10867188
  -3.60922864]

cpptraj

[ -4.93425131 -13.80002308 -20.61605835 ...,  57.92280579  61.25728607
  52.85142136] [ -4.03333616   6.9132452   14.53991318 ...,   6.757936    -2.1086719
   3.60922861]

Tutorials adapted from MDtraj ¶

Clustering with scipy ¶

Principal component analysis with sklearn ¶

PCA analysis with sklearn (adapted from `mdtraj` and cpptraj tutorial)¶

Compare to cpptraj data¶

Tutorials adapted from MDtraj¶

Clustering with scipy¶

Principal component analysis with sklearn¶

PCA analysis with sklearn (adapted from mdtraj and cpptraj tutorial)¶

Compare to cpptraj data¶

Tutorials adapted from MDtraj ¶

Clustering with scipy ¶

Principal component analysis with sklearn ¶

PCA analysis with sklearn (adapted from `mdtraj` and cpptraj tutorial)¶