Source code for stateinterpreter.metastable
import pandas as pd
import numpy as np
from .utils.numerical_utils import gaussian_kde
from ._configs import *
import sys
__all__ = ["identify_metastable_states", "approximate_FES"]
[docs]def identify_metastable_states(
colvar,
selected_cvs,
kBT,
bandwidth,
logweights=None,
fes_cutoff=None,
gradient_descent_iterates = 0,
sort_minima_by = 'cvs_grid',
optimizer_kwargs=dict(),
):
"""Label configurations based on free energy
Parameters
----------
colvar : Pandas dataframe
###
selected_cvs : list of strings
Names of the collective variables used for clustering
kBT : scalar
Temperature
bandwidth : scalar
Bandwidth method for FES calculations
logweights : pandas.DataFrame, np.array or string , optional
Logweights used for FES calculation, by default None
fes_cutoff : float, optional
Cutoff used to select only low free-energy configurations, if None fes_cutoff is 2k_BT
sort_minima_by : string, optional
Sort labels based on `energy`, `cvs`, or `cvs_grid` values, by default `cvs_grid`.
optimizer_kwargs : optional
Arguments for optimizer, by default dict(). Possible kwargs are:
(int) num_init: number of initialization point,
(int) decimals_tolerance: number of decimals to retain to identify unique minima,
(str) sampling: sampling scheme. Accepted strings are 'data_driven' or 'uniform'.
"""
#Adimensional fes_cutoff
if fes_cutoff is None:
fes_cutoff = 2 #kBT
else:
fes_cutoff=fes_cutoff/kBT
# Retrieve logweights
if logweights is not None:
assert ( isinstance(logweights,np.ndarray) ), 'Logweights must be a numpy array.'
# Compute KDE
empirical_centers = colvar[selected_cvs].to_numpy()
KDE = gaussian_kde(empirical_centers,bandwidth=bandwidth,logweights=logweights)
if __DEV__:
print("DEV >>> Finding Local Minima")
minima = KDE.local_minima(**optimizer_kwargs)
# sort minima based on first CV
if sort_minima_by == 'energy':
f_min = np.asarray([-KDE.logpdf(x) for x in minima])
sortperm = np.argsort(f_min, axis=0)
minima = minima[sortperm]
elif sort_minima_by == 'cvs' :
# sort first by 1st cv, then 2nd, ...
x = minima
minima = x [ np.lexsort( np.round(np.flipud(x.T),2) ) ]
elif sort_minima_by == 'cvs_grid' :
bounds = [(x.min(), x.max()) for x in KDE.dataset.T]
# sort based on a binning of the cvs (10 bins per each direction),
# along 1st cv, then 2nd, ...
x = minima
y = (x - [ bound[0] for bound in bounds ])
y /= np.asarray( [ bound[1]-bound[0] for bound in bounds ]) / 10
minima = x [ np.lexsort( np.round(np.flipud(y.T),0) ) ]
else:
raise KeyError(f'Key {sort_minima_by} not allowed. Valid values: "energy","cvs","cvs_grid".')
# Assign basins and select based on FES cutoff
basins = _basin_selection(KDE, minima, fes_cutoff, gradient_descent_iterates)
n_basins = len(basins['labels'].unique())
print(f"Found {n_basins} local minima with selected populations:")
for idx in range(n_basins):
l = len(basins.loc[ (basins['labels'] == idx) & (basins['selection'] == True)])
print(f"\tBasin {idx} -> {l} configurations.")
return basins
def _basin_selection(
KDE, minima, fes_cutoff, gradient_descent_iterates
):
if __DEV__:
print("DEV >>> Basin Assignment")
pts = np.copy(KDE.dataset)
v = np.zeros_like(pts)
beta = 0.9 #Default
learning_rate = np.diag(np.diag(KDE.inv_bwidth)**-1)*0.5
for _ in range(gradient_descent_iterates):
v *= beta
v += -(1 - beta)*KDE.grad(pts, logpdf=True)
pts -= np.dot(v,learning_rate)
norms = np.linalg.norm((pts[:,np.newaxis,:] - minima), axis=2)
classes = np.argmin(norms, axis=1)
fes_at_minima = - KDE.logpdf(minima)
if len(minima) == 1:
ref_fes = fes_at_minima
else:
ref_fes = np.asarray([fes_at_minima[idx] for idx in classes])
fes_pts = - KDE.logpdf(KDE.dataset)
mask = (fes_pts - ref_fes) < fes_cutoff
df = pd.DataFrame(data=classes, columns=["labels"])
df["selection"] = mask
return df
[docs]def approximate_FES(
colvar, bandwidth, selected_cvs=None, kBT=2.5, logweights=None
):
"""Approximate Free Energy Surface (FES) in the space of selected_cvs through Gaussian Kernel Density Estimation
Args:
bandwidth (scalar, vector or matrix):
selected_cvs (numpy.ndarray or pd.Dataframe): List of sampled collective variables with dimensions [num_timesteps, num_CVs]
kBT (scalar): Temperature
logweights (arraylike log weights, optional): Logarithm of the weights. Defaults to None (uniform weights).
Returns:
function: Function approximating the free Energy Surface
"""
if __DEV__:
print("DEV >>> Approximating FES")
if logweights is not None:
assert ( isinstance(logweights,np.ndarray) ), 'Logweights must be a numpy array.'
empirical_centers = colvar[selected_cvs] if selected_cvs is not None else colvar
empirical_centers = empirical_centers.to_numpy()
KDE = gaussian_kde(empirical_centers, bandwidth,logweights=logweights)
return lambda x: -kBT*KDE.logpdf(x)