{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Building the holo CCR2 MSM\n", "\n", "TICA, clustering, and model building on all the holo Anton data.\n", "\n", "- Bryn C. Taylor\n", "- Christopher T. Lee\n", "- Rommie E. Amaro" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from __future__ import print_function\n", "import pyemma\n", "pyemma.__version__\n", "import os\n", "%pylab inline\n", "matplotlib.rcParams.update({'font.size': 12})\n", "import pyemma.coordinates as coor\n", "import pyemma.msm as msm\n", "import pyemma.plots as mplt\n", "import numpy as np\n", "import pickle \n", "import mdtraj as md" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load coordinates and select features" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "topfile = '../holo_trajs/holo_proteinonly.pdb'\n", "from glob import glob\n", "traj_list = glob('../holo_trajs/*holo*') # these are the simulations used in the indivudual Holo and Holo analyses notebooks\n", "traj_list.sort()\n", "traj_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Choose residues to evaluate distances between" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "net = [(9,168), (191,107), (274,107), (274,191), (18,261),(89, 260), (89, 228), (260, 228), (18, 67), (67, 89), \n", " (67, 260), (67, 261), (19, 228), (215,50), (214,103), (214,50), (213,274), (274,50), (274,32), (274,103), (274,36), \n", " (36,213)]\n", "net" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# choose features to look at\n", "feat = coor.featurizer(topfile)\n", "feat.add_residue_mindist(residue_pairs=net)\n", "\n", "# describe the features\n", "feat.describe()\n", "print(feat.dimension())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set input as features in trajectory \n", "inp = coor.source(traj_list, feat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TICA" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dt = .24\n", "lag = np.int(400 / dt) # determine this from the timescales v. lag time plot\n", "tica_obj_400 = coor.tica(inp, lag=lag, kinetic_map=True, var_cutoff=0.90, reversible=True)\n", "\n", "print('TICA Dimension with a lag of 400ns:', tica_obj_400.dimension())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tica_features_lag400 = tica_obj_400.feature_TIC_correlation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.savetxt('/msm_holo/tica_features_correlation_ticalag400.txt', tica_features_lag400)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for i in range(2):\n", " if tica_obj_400.eigenvectors[0, i] > 0:\n", " tica_obj_400.eigenvectors[:, i] *= -1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "Y = tica_obj_400.get_output() \n", "total_sim_Y = np.concatenate(Y, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## save all \n", "with open('/msm_holo/Y_ticalag400.txt', 'wb') as fp:\n", " pickle.dump(Y, fp)\n", "with open('/msm_holo/total_sim_Y_ticalag400.txt', 'wb') as fp:\n", " pickle.dump(total_sim_Y, fp)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## open all\n", "with open('/msm_holo/Y_ticalag400.txt', 'rb') as fp:\n", " Y = pickle.load(fp)\n", "with open('/msm_holo/total_sim_Y_ticalag400.txt', 'rb') as fp:\n", " total_sim_Y = pickle.load(fp)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## K-means clustering" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "clustering_ticalag400 = coor.cluster_kmeans(Y, max_iter=50)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dtrajs = clustering_ticalag400.dtrajs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## save all\n", "\n", "##clustercenters\n", "with open('/msm_holo/clustercenters_ticalag400_clustdefault_net2.txt', 'wb') as fp:\n", " pickle.dump(clustering_ticalag400.clustercenters, fp)\n", "\n", "##dtrajs\n", "with open('/msm_holo/dtrajs_ticalag400_clustdefault.txt', 'wb') as fp:\n", " pickle.dump(dtrajs, fp)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# open all \n", "\n", "# clustercenters \n", "with open('/msm_holo/clustercenters_ticalag400_clustdefault_net2.txt', 'rb') as fp:\n", " clustering_holo_clustercenters = pickle.load(fp)\n", "\n", "# dtrajs \n", "with open('/msm_holo/dtrajs_ticalag400_clustdefault.txt', 'rb') as fp:\n", " dtrajs_holo = pickle.load(fp)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12,9), dpi=120)\n", "plt.title('Free Energy and Clusters of CCR2 Holo \\nProjected in tICA Coordinate Space', fontsize=16)\n", "\n", "plt.subplot(321)\n", "plt.xlabel('tIC 0', fontsize=14)\n", "plt.ylabel('tIC 1', fontsize=14)\n", "\n", "_, ax = mplt.plot_free_energy(total_sim_Y[:, 0], total_sim_Y[:, 1])\n", "ax.scatter(clustering_holo_clustercenters[:,0], clustering_holo_clustercenters[:,1], color='white', s=10)\n", "\n", "\n", "plt.subplot(322)\n", "#plt.title('Free Energy of CCR2 Holo \\nProjected in the First Two tICA Coordinates', fontsize=16)\n", "plt.xlabel('tIC 1', fontsize=14)\n", "plt.ylabel('tIC 2', fontsize=14)\n", "\n", "_, ax = mplt.plot_free_energy(total_sim_Y[:, 1], total_sim_Y[:, 2])\n", "ax.scatter(clustering_holo_clustercenters[:,1], clustering_holo_clustercenters[:,2], color='white', s=10)\n", "\n", "\n", "plt.subplot(323)\n", "#plt.title('Free Energy of CCR2 Holo \\nProjected in the First Two tICA Coordinates', fontsize=16)\n", "plt.xlabel('tIC 0', fontsize=14)\n", "plt.ylabel('tIC 2', fontsize=14)\n", "\n", "_, ax = mplt.plot_free_energy(total_sim_Y[:, 0], total_sim_Y[:, 2])\n", "ax.scatter(clustering_holo_clustercenters[:,0], clustering_holo_clustercenters[:,2], color='white', s=10)\n", "\n", "\n", "plt.subplot(324)\n", "#plt.title('Free Energy of CCR2 Holo \\nProjected in the First Two tICA Coordinates', fontsize=16)\n", "plt.xlabel('tIC 0', fontsize=14)\n", "plt.ylabel('tIC 3', fontsize=14)\n", "\n", "_, ax = mplt.plot_free_energy(total_sim_Y[:, 0], total_sim_Y[:, 3])\n", "ax.scatter(clustering_holo_clustercenters[:,0], clustering_holo_clustercenters[:,3], color='white', s=10)\n", "\n", "plt.subplot(325)\n", "#plt.title('Free Energy of CCR2 Holo \\nProjected in the First Two tICA Coordinates', fontsize=16)\n", "plt.xlabel('tIC 1', fontsize=14)\n", "plt.ylabel('tIC 3', fontsize=14)\n", "\n", "_, ax = mplt.plot_free_energy(total_sim_Y[:, 1], total_sim_Y[:, 3])\n", "ax.scatter(clustering_holo_clustercenters[:,1], clustering_holo_clustercenters[:,3], color='white', s=10)\n", "\n", "plt.subplot(326)\n", "#plt.title('Free Energy of CCR2 Holo \\nProjected in the First Two tICA Coordinates', fontsize=16)\n", "plt.xlabel('tIC 2', fontsize=14)\n", "plt.ylabel('tIC 3', fontsize=14)\n", "\n", "_, ax = mplt.plot_free_energy(total_sim_Y[:, 2], total_sim_Y[:, 3])\n", "ax.scatter(clustering_holo_clustercenters[:,2], clustering_holo_clustercenters[:,3], color='white', s=10)\n", "\n", "plt.tight_layout()\n", "#plt.savefig('/msm_holo/images/01_clustering_ticalag400_FE.png')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12,9), dpi=300)\n", "plt.title('Free Energy and Clusters of CCR2 Holo \\nProjected in tICA Coordinate Space', fontsize=16)\n", "\n", "plt.xlabel('tIC 0', fontsize=14)\n", "plt.ylabel('tIC 1', fontsize=14)\n", "\n", "_, ax = mplt.plot_free_energy(total_sim_Y[:, 0], total_sim_Y[:, 1])\n", "ax.scatter(clustering_holo_clustercenters[:,0], clustering_holo_clustercenters[:,1], color='white', s=10)\n", "\n", "#plt.savefig('/msm_holo/images/01_clustering_ticalag400_FE_network_tic01.png')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Implied Timescale Plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "its_holo_bayes = msm.its(dtrajs_holo, errors='bayes', lags=500)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12,9), dpi=300)\n", "dt = 0.24\n", "mplt.plot_implied_timescales(its_holo_bayes, units=r'$ns$', nits=10, linewidth=2, dt=dt)\n", "plt.title('Implied Timescale Plot with Errors: Holo CCR2')\n", "#plt.savefig('/msm_holo/images/02_its_bayes_ns.png')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Markov Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dt =.24\n", "msm_lag = 200 # choose this from ITS plots\n", "M_holo_200 = msm.estimate_markov_model(dtrajs_holo, msm_lag, reversible=True)\n", "print('fraction of states used = ', M_holo_200.active_state_fraction)\n", "print('fraction of counts used = ', M_holo_200.active_count_fraction)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dt =.24\n", "msm_lag = 200 # choose this from ITS plots\n", "M_holo_200_bayes = msm.bayesian_markov_model(dtrajs_holo, msm_lag, reversible=True)\n", "print('fraction of states used = ', M_holo_200_bayes.active_state_fraction)\n", "print('fraction of counts used = ', M_holo_200_bayes.active_count_fraction)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#save as\n", "\n", "with open('/msm_holo/M_holo_emm_ticalag400ns_msmlag200steps.txt', 'wb') as fp:\n", " pickle.dump(M_holo_200, fp)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#open up \n", " \n", "with open ('/msm_holo/M_holo_emm_ticalag400ns_msmlag200steps.txt', 'rb') as fp:\n", " M_holo_200 = pickle.load(fp) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CK Test" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "cktest7 = M_holo_200_bayes.cktest(7, err_est=True, mlags=4)\n", "mplt.plot_cktest(cktest7, figsize=(30,30), layout=(2,2), padding_top=0.1, y01=False, padding_between=0.3, units='steps')\n", "plt.savefig('/msm_holo/images/05_cktest_7.png')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## HMM" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "nhidden=7\n", "\n", "estimator = MaximumLikelihoodHMSM(lag=M_holo_200.lagtime, \n", " nstates=nhidden, \n", " msm_init=M_holo_200, \n", " reversible=M_holo_200.reversible,\n", " dt_traj=M_holo_200.dt_traj,\n", " mincount_connectivity=0\n", " )\n", "estimator.estimate(M_holo_200.discrete_trajectories_full)\n", "hmm = estimator.model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "rates = hmm.P/48*1e6 # 1/ms\n", "print(\"Scaled Rates in 1/ms\")\n", "print(np.array2string(rates, precision=2))\n", "print(\"Probabilities\")\n", "print(np.array2string(hmm.P, precision=2))\n", "mplt.plot_markov_model(hmm, minflux=1e-25, arrow_labels=rates, arrow_label_format='%10.8f')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot FE + Macrostates" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "proj_Y = total_sim_Y[:,[0,1]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "metastable_hmm_traj = [hmm.metastable_assignments[traj[:]] for traj in M_holo_200.discrete_trajectories_full]\n", "metastable_assign = np.concatenate(metastable_hmm_traj, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "counts = np.zeros(nhidden)\n", "macroCenters = np.zeros((nhidden,2))\n", "for i in np.arange(0,len(proj_Y)):\n", " state = metastable_assign[i]\n", " counts[state] += 1\n", " macroCenters[state] += proj_Y[i]\n", "macroCenters = macroCenters / counts[:,None]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "fig, ax = mplt.plot_free_energy(total_sim_Y[:, 0], total_sim_Y[:, 1], \n", " kT=0.592172822, \n", " cbar_label='Free Energy (kcal/mol)',\n", " avoid_zero_count=True\n", " )\n", "fig.set_size_inches((12,9))\n", "ax.set_title('''CCR2 Holo Macrostates''', fontsize=16)\n", "ax.set_ylabel('TIC 0', fontsize=14)\n", "ax.set_xlabel('TIC 1', fontsize=14)\n", "\n", "ax.scatter(macroCenters[:,0], macroCenters[:,1], marker='*', s=500, c='w')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(np.array2string(macroCenters))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Timescales" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "M_holo_200.timescales(10)/1000" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "hmm.timescales()/1000" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Make FE plot with all clusters in HMM states" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# clustercenters\n", " \n", "with open('/msm_holo/clustercenters_ticalag400_clustdefault_net2.txt', 'rb') as fp:\n", " clustering_holo_clustercenters = pickle.load(fp, encoding='latin1')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "hmm.metastable_sets\n", "\n", "state0 = hmm.metastable_sets[0]\n", "state1 = hmm.metastable_sets[1]\n", "state2 = hmm.metastable_sets[2]\n", "state3 = hmm.metastable_sets[3]\n", "state4 = hmm.metastable_sets[4]\n", "state5 = hmm.metastable_sets[5]\n", "state6 = hmm.metastable_sets[6]\n", "len(state6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "fig, ax, misc = mplt.plot_free_energy(total_sim_Y[:, 0], total_sim_Y[:, 1], \n", " kT=0.592172822, \n", " cbar_label='Free Energy (kcal/mol)',\n", " avoid_zero_count=True,\n", " legacy=False\n", " )\n", "\n", "fig.set_size_inches((12,9))\n", "fig.set_dpi(300)\n", "# ax.set_title('''Holo''', fontsize=16)\n", "ax.set_ylabel('Holo TIC 1', fontsize=36)\n", "ax.set_xlabel('Holo TIC 0', fontsize=36)\n", "misc['cbar'].ax.tick_params(labelsize=36)\n", "misc['cbar'].set_label('Free Energy (kcal/mol)',fontsize=36)\n", "plt.tick_params(axis='both', which='major', labelsize=36)\n", "alpha=.8\n", "\n", "for k in range(22):\n", " ax.plot(clustering_holo_clustercenters[state0[k],0], clustering_holo_clustercenters[state0[k],1], linewidth=20, color='white', marker='o', markersize=10, alpha=alpha)\n", "for k in range(252):\n", " ax.plot(clustering_holo_clustercenters[state1[k],0], clustering_holo_clustercenters[state1[k],1], linewidth=20, color='red', marker='o', markersize=10, alpha=alpha)\n", "for k in range(54):\n", " ax.plot(clustering_holo_clustercenters[state2[k],0], clustering_holo_clustercenters[state2[k],1], linewidth=20, color='yellow', marker='o', markersize=10, alpha=alpha) \n", "for k in range(74):\n", " ax.plot(clustering_holo_clustercenters[state3[k],0], clustering_holo_clustercenters[state3[k],1], linewidth=20, color='cyan', marker='o', markersize=10, alpha=alpha) \n", "for k in range(239):\n", " ax.plot(clustering_holo_clustercenters[state4[k],0], clustering_holo_clustercenters[state4[k],1], linewidth=20, color='grey', marker='o', markersize=10, alpha=alpha) \n", "for k in range(114):\n", " ax.plot(clustering_holo_clustercenters[state5[k],0], clustering_holo_clustercenters[state5[k],1], linewidth=20, color='purple', marker='o', markersize=10, alpha=alpha) \n", "for k in range(35):\n", " ax.plot(clustering_holo_clustercenters[state6[k],0], clustering_holo_clustercenters[state6[k],1], linewidth=20, color='black', marker='o', markersize=10, alpha=alpha)\n", " \n", " \n", "ax.scatter(macroCenters[0,0], macroCenters[0,1], marker='o', s=1600, c='white', linewidths=2, edgecolors='black', zorder=99)\n", "ax.scatter(macroCenters[1,0], macroCenters[1,1], marker='o', s=1600, c='red', linewidths=2, edgecolors='black', zorder=99)\n", "ax.scatter(macroCenters[2,0], macroCenters[2,1], marker='o', s=1600, c='yellow', linewidths=2, edgecolors='black', zorder=99)\n", "ax.scatter(macroCenters[3,0], macroCenters[3,1], marker='o', s=1600, c='cyan', linewidths=2, edgecolors='black', zorder=99)\n", "ax.scatter(macroCenters[4,0], macroCenters[4,1], marker='o', s=1600, c='grey', linewidths=2, edgecolors='black', zorder=99)\n", "ax.scatter(macroCenters[5,0], macroCenters[5,1], marker='o', s=1600, c='purple', linewidths=2, edgecolors='black', zorder=99)\n", "ax.scatter(macroCenters[6,0], macroCenters[6,1], marker='o', s=1600, c='black', linewidths=2, edgecolors='white', zorder=99)\n", "\n", "\n", "fig.savefig('/apo_holo_msm_images/holo_hmm.eps', dpi=300, format='eps')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 1 }