Source code for stateinterpreter.metastable

import pandas as pd
import numpy as np
from .utils.numerical_utils import gaussian_kde
from ._configs import *
import sys

__all__ = ["identify_metastable_states", "approximate_FES"]

[docs]def identify_metastable_states( colvar, selected_cvs, kBT, bandwidth, logweights=None, fes_cutoff=None, gradient_descent_iterates = 0, sort_minima_by = 'cvs_grid', optimizer_kwargs=dict(), ): """Label configurations based on free energy Parameters ---------- colvar : Pandas dataframe ### selected_cvs : list of strings Names of the collective variables used for clustering kBT : scalar Temperature bandwidth : scalar Bandwidth method for FES calculations logweights : pandas.DataFrame, np.array or string , optional Logweights used for FES calculation, by default None fes_cutoff : float, optional Cutoff used to select only low free-energy configurations, if None fes_cutoff is 2k_BT sort_minima_by : string, optional Sort labels based on `energy`, `cvs`, or `cvs_grid` values, by default `cvs_grid`. optimizer_kwargs : optional Arguments for optimizer, by default dict(). Possible kwargs are: (int) num_init: number of initialization point, (int) decimals_tolerance: number of decimals to retain to identify unique minima, (str) sampling: sampling scheme. Accepted strings are 'data_driven' or 'uniform'. """ #Adimensional fes_cutoff if fes_cutoff is None: fes_cutoff = 2 #kBT else: fes_cutoff=fes_cutoff/kBT # Retrieve logweights if logweights is not None: assert ( isinstance(logweights,np.ndarray) ), 'Logweights must be a numpy array.' # Compute KDE empirical_centers = colvar[selected_cvs].to_numpy() KDE = gaussian_kde(empirical_centers,bandwidth=bandwidth,logweights=logweights) if __DEV__: print("DEV >>> Finding Local Minima") minima = KDE.local_minima(**optimizer_kwargs) # sort minima based on first CV if sort_minima_by == 'energy': f_min = np.asarray([-KDE.logpdf(x) for x in minima]) sortperm = np.argsort(f_min, axis=0) minima = minima[sortperm] elif sort_minima_by == 'cvs' : # sort first by 1st cv, then 2nd, ... x = minima minima = x [ np.lexsort( np.round(np.flipud(x.T),2) ) ] elif sort_minima_by == 'cvs_grid' : bounds = [(x.min(), x.max()) for x in KDE.dataset.T] # sort based on a binning of the cvs (10 bins per each direction), # along 1st cv, then 2nd, ... x = minima y = (x - [ bound[0] for bound in bounds ]) y /= np.asarray( [ bound[1]-bound[0] for bound in bounds ]) / 10 minima = x [ np.lexsort( np.round(np.flipud(y.T),0) ) ] else: raise KeyError(f'Key {sort_minima_by} not allowed. Valid values: "energy","cvs","cvs_grid".') # Assign basins and select based on FES cutoff basins = _basin_selection(KDE, minima, fes_cutoff, gradient_descent_iterates) n_basins = len(basins['labels'].unique()) print(f"Found {n_basins} local minima with selected populations:") for idx in range(n_basins): l = len(basins.loc[ (basins['labels'] == idx) & (basins['selection'] == True)]) print(f"\tBasin {idx} -> {l} configurations.") return basins
def _basin_selection( KDE, minima, fes_cutoff, gradient_descent_iterates ): if __DEV__: print("DEV >>> Basin Assignment") pts = np.copy(KDE.dataset) v = np.zeros_like(pts) beta = 0.9 #Default learning_rate = np.diag(np.diag(KDE.inv_bwidth)**-1)*0.5 for _ in range(gradient_descent_iterates): v *= beta v += -(1 - beta)*KDE.grad(pts, logpdf=True) pts -= np.dot(v,learning_rate) norms = np.linalg.norm((pts[:,np.newaxis,:] - minima), axis=2) classes = np.argmin(norms, axis=1) fes_at_minima = - KDE.logpdf(minima) if len(minima) == 1: ref_fes = fes_at_minima else: ref_fes = np.asarray([fes_at_minima[idx] for idx in classes]) fes_pts = - KDE.logpdf(KDE.dataset) mask = (fes_pts - ref_fes) < fes_cutoff df = pd.DataFrame(data=classes, columns=["labels"]) df["selection"] = mask return df
[docs]def approximate_FES( colvar, bandwidth, selected_cvs=None, kBT=2.5, logweights=None ): """Approximate Free Energy Surface (FES) in the space of selected_cvs through Gaussian Kernel Density Estimation Args: bandwidth (scalar, vector or matrix): selected_cvs (numpy.ndarray or pd.Dataframe): List of sampled collective variables with dimensions [num_timesteps, num_CVs] kBT (scalar): Temperature logweights (arraylike log weights, optional): Logarithm of the weights. Defaults to None (uniform weights). Returns: function: Function approximating the free Energy Surface """ if __DEV__: print("DEV >>> Approximating FES") if logweights is not None: assert ( isinstance(logweights,np.ndarray) ), 'Logweights must be a numpy array.' empirical_centers = colvar[selected_cvs] if selected_cvs is not None else colvar empirical_centers = empirical_centers.to_numpy() KDE = gaussian_kde(empirical_centers, bandwidth,logweights=logweights) return lambda x: -kBT*KDE.logpdf(x)