Mg2SiO4のolivineとspinelのMDスナップショットのdescriptorを用いた比較¶
- olivine, spinelともに 1000K, 20GPa のMDからのスナップショット(112原子 x 10構造)
- Behler-Parrinello型symmetry function (BPSF)
ndim
次元のデータ.これは少し説明が必要だが,BPSFを大量に用意して,Orthogonal matching pursuit (OMP) という手法で次元削減したもの. - pmdからBPSF descriptorを出力したもの(
out.desc.gsf
)を読み込んで pandas.DataFrame に変換し,PCAにかける.
In [5]:
%matplotlib inline
import os, sys
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import nappy
import pandas as pd
np.set_printoptions(precision=3, floatmode='fixed')
sns.set_theme(context='talk', style='ticks')
In [2]:
# olivine, spinel の各ディレクトリ内のout.desc.gsf_### ファイルリストを取得
gsfs_oli = glob.glob('mkdesc_olivine/out.desc.gsf_[1-9]*')
gsfs_spi = glob.glob('mkdesc_spinel/out.desc.gsf_[1-9]*')
print(gsfs_oli, gsfs_spi)
['mkdesc_olivine/out.desc.gsf_4000', 'mkdesc_olivine/out.desc.gsf_1000', 'mkdesc_olivine/out.desc.gsf_5000', 'mkdesc_olivine/out.desc.gsf_8000', 'mkdesc_olivine/out.desc.gsf_9000', 'mkdesc_olivine/out.desc.gsf_2000', 'mkdesc_olivine/out.desc.gsf_10000', 'mkdesc_olivine/out.desc.gsf_6000', 'mkdesc_olivine/out.desc.gsf_7000', 'mkdesc_olivine/out.desc.gsf_3000'] ['mkdesc_spinel/out.desc.gsf_4000', 'mkdesc_spinel/out.desc.gsf_1000', 'mkdesc_spinel/out.desc.gsf_5000', 'mkdesc_spinel/out.desc.gsf_8000', 'mkdesc_spinel/out.desc.gsf_9000', 'mkdesc_spinel/out.desc.gsf_2000', 'mkdesc_spinel/out.desc.gsf_10000', 'mkdesc_spinel/out.desc.gsf_6000', 'mkdesc_spinel/out.desc.gsf_7000', 'mkdesc_spinel/out.desc.gsf_3000']
In [7]:
ndim = 19
In [3]:
# descファイルを読み込む関数
def read_desc(fname):
import numpy as np
with open(fname, 'r') as f:
lines = f.readlines()
natm = -1
nsf = -1
desc = None
inc = -1
for i,l in enumerate(lines):
data = l.split()
if len(data) == 0:
continue
if l[0] == '#':
if data[1] == 'natm:':
natm = int(data[2])
elif data[1] == 'nsf:':
nsf = int(data[2])
if natm < 0 or nsf < 0:
continue
else:
if desc is None:
desc = np.zeros((natm,nsf))
inc = 0
continue
ds = [ float(d) for d in data ]
desc[inc,:] = ds[:]
inc += 1
return desc
In [8]:
# olivine構造のDataFrame作成
nsys = nappy.io.read('pmdini_olivine')
species = nsys.get_symbols()
data_oli = {'data':np.zeros([0,0]), 'species':np.zeros([0])}
df_oli = None
for gsf in gsfs_oli:
print(gsf)
di = read_desc(gsf)
df1 = pd.DataFrame(di, columns=[f'{i+1:d}' for i in range(ndim)])
df2 = pd.DataFrame({'species':species,
'struct':['olivine' for i,s in enumerate(species)]})
df = pd.concat([df1,df2], axis=1)
if df_oli is None:
df_oli = df
else:
df_oli = pd.concat([df_oli, df])
df_oli
mkdesc_olivine/out.desc.gsf_4000 mkdesc_olivine/out.desc.gsf_1000 mkdesc_olivine/out.desc.gsf_5000 mkdesc_olivine/out.desc.gsf_8000 mkdesc_olivine/out.desc.gsf_9000 mkdesc_olivine/out.desc.gsf_2000 mkdesc_olivine/out.desc.gsf_10000 mkdesc_olivine/out.desc.gsf_6000 mkdesc_olivine/out.desc.gsf_7000 mkdesc_olivine/out.desc.gsf_3000
Out[8]:
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | species | struct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000539 | 0.62871 | 0.101800 | 0.000263 | 0.72694 | 0.063504 | 0.80328 | 2.7978 | 1.18750 | 0.002173 | ... | 0.049897 | 0.092742 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
1 | 0.000715 | 0.63984 | 0.118260 | 0.000335 | 0.82090 | 0.076565 | 0.78271 | 2.9897 | 1.51180 | 0.005989 | ... | 0.064914 | 0.120080 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
2 | 0.000390 | 0.61289 | 0.081793 | 0.000200 | 0.63668 | 0.046893 | 0.77756 | 2.5789 | 1.07860 | 0.002967 | ... | 0.034090 | 0.067948 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
3 | 0.000531 | 0.63249 | 0.101910 | 0.000095 | 0.73328 | 0.032303 | 0.82981 | 2.7731 | 1.10470 | 0.001249 | ... | 0.048499 | 0.086804 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
4 | 0.000120 | 0.57602 | 0.034848 | 0.000027 | 0.42686 | 0.010140 | 0.74274 | 2.7256 | 1.18550 | 0.007026 | ... | 0.040301 | 0.083489 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
107 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.34932 | 1.4168 | 0.63966 | 0.001363 | ... | 0.000000 | 0.000000 | 0.024149 | 0.005478 | 0.011140 | 0.020897 | 0.020047 | 0.033063 | O | olivine |
108 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.36231 | 1.5187 | 0.78218 | 0.002611 | ... | 0.000000 | 0.000000 | 0.006454 | 0.019646 | 0.029715 | 0.082459 | 0.002062 | 0.011103 | O | olivine |
109 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.41487 | 1.3286 | 0.52607 | 0.000922 | ... | 0.000000 | 0.000000 | 0.001635 | 0.012877 | 0.018680 | 0.081466 | 0.000908 | 0.007234 | O | olivine |
110 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.40870 | 1.3452 | 0.49501 | 0.000348 | ... | 0.000000 | 0.000000 | 0.003997 | 0.010358 | 0.016184 | 0.066220 | 0.001656 | 0.009500 | O | olivine |
111 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.38783 | 1.4806 | 0.70626 | 0.001703 | ... | 0.000000 | 0.000000 | 0.004800 | 0.017835 | 0.026778 | 0.091580 | 0.001917 | 0.010966 | O | olivine |
1120 rows × 21 columns
In [10]:
# spinel構造のDataFrame作成
nsys = nappy.io.read('pmdini_spinel')
species = nsys.get_symbols()
df_spi = None
for gsf in gsfs_spi:
print(gsf)
di = read_desc(gsf)
df1 = pd.DataFrame(di, columns=[f'{i+1:d}' for i in range(ndim)])
df2 = pd.DataFrame({'species':species,
'struct':['spinel' for i,s in enumerate(species)]})
df = pd.concat([df1,df2], axis=1)
if df_spi is None:
df_spi = df
else:
df_spi = pd.concat([df_spi, df])
df_spi
mkdesc_spinel/out.desc.gsf_4000 mkdesc_spinel/out.desc.gsf_1000 mkdesc_spinel/out.desc.gsf_5000 mkdesc_spinel/out.desc.gsf_8000 mkdesc_spinel/out.desc.gsf_9000 mkdesc_spinel/out.desc.gsf_2000 mkdesc_spinel/out.desc.gsf_10000 mkdesc_spinel/out.desc.gsf_6000 mkdesc_spinel/out.desc.gsf_7000 mkdesc_spinel/out.desc.gsf_3000
Out[10]:
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | species | struct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000024 | 0.41971 | 0.009505 | 0.000580 | 1.0880 | 0.13376 | 0.82408 | 2.8157 | 1.21900 | 0.002809 | ... | 0.045968 | 0.090854 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | spinel |
1 | 0.000018 | 0.43576 | 0.007852 | 0.000592 | 1.0729 | 0.13182 | 0.81081 | 2.8603 | 1.27690 | 0.002923 | ... | 0.047791 | 0.097070 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | spinel |
2 | 0.000014 | 0.38711 | 0.006169 | 0.000537 | 1.1061 | 0.12992 | 0.80708 | 2.8047 | 1.19560 | 0.001932 | ... | 0.047913 | 0.093557 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | spinel |
3 | 0.000016 | 0.41068 | 0.006867 | 0.000738 | 1.1448 | 0.16176 | 0.85449 | 2.8887 | 1.31860 | 0.003395 | ... | 0.049827 | 0.100510 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | spinel |
4 | 0.000022 | 0.47263 | 0.009383 | 0.000506 | 1.0985 | 0.12513 | 0.80757 | 2.8497 | 1.32410 | 0.004889 | ... | 0.048869 | 0.100420 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | spinel |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
107 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 0.44368 | 1.2912 | 0.44298 | 0.000349 | ... | 0.000000 | 0.000000 | 0.000268 | 0.014644 | 0.019666 | 0.081814 | 0.000259 | 0.005240 | O | spinel |
108 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 0.38360 | 1.4027 | 0.63210 | 0.001478 | ... | 0.000000 | 0.000000 | 0.001865 | 0.017336 | 0.024916 | 0.079120 | 0.001912 | 0.009930 | O | spinel |
109 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 0.44118 | 1.4429 | 0.61205 | 0.000811 | ... | 0.000000 | 0.000000 | 0.001185 | 0.019475 | 0.027238 | 0.085882 | 0.000710 | 0.007248 | O | spinel |
110 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 0.38545 | 1.5511 | 0.83206 | 0.002816 | ... | 0.000000 | 0.000000 | 0.004023 | 0.023825 | 0.035074 | 0.098466 | 0.002795 | 0.012908 | O | spinel |
111 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 0.40504 | 1.5743 | 0.86869 | 0.003303 | ... | 0.000000 | 0.000000 | 0.007300 | 0.020957 | 0.032637 | 0.093801 | 0.002423 | 0.013150 | O | spinel |
1120 rows × 21 columns
In [11]:
# 全て結合しての1つのDF作成
dfs = pd.concat([df_oli, df_spi], axis=0)
dfs
Out[11]:
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | species | struct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000539 | 0.62871 | 0.101800 | 0.000263 | 0.72694 | 0.063504 | 0.80328 | 2.7978 | 1.18750 | 0.002173 | ... | 0.049897 | 0.092742 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
1 | 0.000715 | 0.63984 | 0.118260 | 0.000335 | 0.82090 | 0.076565 | 0.78271 | 2.9897 | 1.51180 | 0.005989 | ... | 0.064914 | 0.120080 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
2 | 0.000390 | 0.61289 | 0.081793 | 0.000200 | 0.63668 | 0.046893 | 0.77756 | 2.5789 | 1.07860 | 0.002967 | ... | 0.034090 | 0.067948 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
3 | 0.000531 | 0.63249 | 0.101910 | 0.000095 | 0.73328 | 0.032303 | 0.82981 | 2.7731 | 1.10470 | 0.001249 | ... | 0.048499 | 0.086804 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
4 | 0.000120 | 0.57602 | 0.034848 | 0.000027 | 0.42686 | 0.010140 | 0.74274 | 2.7256 | 1.18550 | 0.007026 | ... | 0.040301 | 0.083489 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Mg | olivine |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
107 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.44368 | 1.2912 | 0.44298 | 0.000349 | ... | 0.000000 | 0.000000 | 0.000268 | 0.014644 | 0.019666 | 0.081814 | 0.000259 | 0.005240 | O | spinel |
108 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.38360 | 1.4027 | 0.63210 | 0.001478 | ... | 0.000000 | 0.000000 | 0.001865 | 0.017336 | 0.024916 | 0.079120 | 0.001912 | 0.009930 | O | spinel |
109 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.44118 | 1.4429 | 0.61205 | 0.000811 | ... | 0.000000 | 0.000000 | 0.001185 | 0.019475 | 0.027238 | 0.085882 | 0.000710 | 0.007248 | O | spinel |
110 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.38545 | 1.5511 | 0.83206 | 0.002816 | ... | 0.000000 | 0.000000 | 0.004023 | 0.023825 | 0.035074 | 0.098466 | 0.002795 | 0.012908 | O | spinel |
111 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.40504 | 1.5743 | 0.86869 | 0.003303 | ... | 0.000000 | 0.000000 | 0.007300 | 0.020957 | 0.032637 | 0.093801 | 0.002423 | 0.013150 | O | spinel |
2240 rows × 21 columns
In [12]:
# PCAを行う
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
In [13]:
data_cols = [ f'{i+1:d}' for i in range(ndim) ]
cdict = {'olivine':'red', 'spinel':'blue'}
In [14]:
proj = pca.fit_transform(dfs[dfs.species=='Mg'][data_cols])
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(proj[:,0], proj[:,1],
c=[ cdict[s] for s in dfs[dfs.species=='Mg'].struct],
alpha=0.5)
ax.set_title('PCA of Mg in Mg2SiO4')
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()
In [15]:
pca.fit(dfs[dfs.species=='Mg'][data_cols])
proj_oli = pca.transform(dfs.query('species == "Mg" & struct == "olivine"')[data_cols])
proj_spi = pca.transform(dfs.query('species == "Mg" & struct == "spinel"')[data_cols])
#proj_spi = pca.transform(dfs[dfs.species=='Mg' & dfs.struct=='spinel'])
fig, ax = plt.subplots(figsize=(5,5))
sns.kdeplot(proj_oli[:,0], color='red', label='olivine', fill=True)
sns.kdeplot(proj_spi[:,0], color='blue', label='spinel', fill=True)
ax.set_xlabel('PC-1')
ax.set_ylabel('Density')
plt.legend(loc='best', frameon=False)
plt.show()
In [16]:
proj = pca.fit_transform(dfs[dfs.species=='Si'][data_cols])
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(proj[:,0], proj[:,1],
c=[ cdict[s] for s in dfs[dfs.species=='Si'].struct],
alpha=0.5)
ax.set_title('PCA of Si in Mg2SiO4')
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()
In [17]:
proj = pca.fit_transform(dfs[dfs.species=='O'][data_cols])
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(proj[:,0], proj[:,1],
c=[ cdict[s] for s in dfs[dfs.species=='O'].struct],
alpha=0.5)
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()
In [ ]: