Mg2SiO4のolivineとspinelのMDスナップショットのdescriptorを用いた比較¶

  • olivine, spinelともに 1000K, 20GPa のMDからのスナップショット(112原子 x 10構造)
  • Behler-Parrinello型symmetry function (BPSF) ndim次元のデータ.これは少し説明が必要だが,BPSFを大量に用意して,Orthogonal matching pursuit (OMP) という手法で次元削減したもの.
  • pmdからBPSF descriptorを出力したもの(out.desc.gsf)を読み込んで pandas.DataFrame に変換し,PCAにかける.
In [5]:
%matplotlib inline
import os, sys
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import nappy
import pandas as pd

np.set_printoptions(precision=3, floatmode='fixed')
sns.set_theme(context='talk', style='ticks')
In [2]:
# olivine, spinel の各ディレクトリ内のout.desc.gsf_### ファイルリストを取得
gsfs_oli = glob.glob('mkdesc_olivine/out.desc.gsf_[1-9]*')
gsfs_spi = glob.glob('mkdesc_spinel/out.desc.gsf_[1-9]*')
print(gsfs_oli, gsfs_spi)
['mkdesc_olivine/out.desc.gsf_4000', 'mkdesc_olivine/out.desc.gsf_1000', 'mkdesc_olivine/out.desc.gsf_5000', 'mkdesc_olivine/out.desc.gsf_8000', 'mkdesc_olivine/out.desc.gsf_9000', 'mkdesc_olivine/out.desc.gsf_2000', 'mkdesc_olivine/out.desc.gsf_10000', 'mkdesc_olivine/out.desc.gsf_6000', 'mkdesc_olivine/out.desc.gsf_7000', 'mkdesc_olivine/out.desc.gsf_3000'] ['mkdesc_spinel/out.desc.gsf_4000', 'mkdesc_spinel/out.desc.gsf_1000', 'mkdesc_spinel/out.desc.gsf_5000', 'mkdesc_spinel/out.desc.gsf_8000', 'mkdesc_spinel/out.desc.gsf_9000', 'mkdesc_spinel/out.desc.gsf_2000', 'mkdesc_spinel/out.desc.gsf_10000', 'mkdesc_spinel/out.desc.gsf_6000', 'mkdesc_spinel/out.desc.gsf_7000', 'mkdesc_spinel/out.desc.gsf_3000']
In [7]:
ndim = 19
In [3]:
# descファイルを読み込む関数
def read_desc(fname):
    import numpy as np
    with open(fname, 'r') as f:
        lines = f.readlines()
    natm = -1
    nsf = -1
    desc = None
    inc = -1
    for i,l in enumerate(lines):
        data = l.split()
        if len(data) == 0:
            continue
        if l[0] == '#':
            if data[1] == 'natm:':
                natm = int(data[2])
            elif data[1] == 'nsf:':
                nsf = int(data[2])
        if natm < 0 or nsf < 0:
            continue
        else:
            if desc is None:
                desc = np.zeros((natm,nsf))
                inc = 0
                continue
        ds = [ float(d) for d in data ]
        desc[inc,:] = ds[:]
        inc += 1
    return desc
In [8]:
# olivine構造のDataFrame作成
nsys = nappy.io.read('pmdini_olivine')
species = nsys.get_symbols()
data_oli = {'data':np.zeros([0,0]), 'species':np.zeros([0])}
df_oli = None
for gsf in gsfs_oli:
    print(gsf)
    di = read_desc(gsf)
    df1 = pd.DataFrame(di, columns=[f'{i+1:d}' for i in range(ndim)])
    df2 = pd.DataFrame({'species':species, 
                        'struct':['olivine' for i,s in enumerate(species)]})
    df = pd.concat([df1,df2], axis=1)
    if df_oli is None:
        df_oli = df
    else:
        df_oli = pd.concat([df_oli, df])
df_oli
mkdesc_olivine/out.desc.gsf_4000
mkdesc_olivine/out.desc.gsf_1000
mkdesc_olivine/out.desc.gsf_5000
mkdesc_olivine/out.desc.gsf_8000
mkdesc_olivine/out.desc.gsf_9000
mkdesc_olivine/out.desc.gsf_2000
mkdesc_olivine/out.desc.gsf_10000
mkdesc_olivine/out.desc.gsf_6000
mkdesc_olivine/out.desc.gsf_7000
mkdesc_olivine/out.desc.gsf_3000
Out[8]:
1 2 3 4 5 6 7 8 9 10 ... 12 13 14 15 16 17 18 19 species struct
0 0.000539 0.62871 0.101800 0.000263 0.72694 0.063504 0.80328 2.7978 1.18750 0.002173 ... 0.049897 0.092742 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
1 0.000715 0.63984 0.118260 0.000335 0.82090 0.076565 0.78271 2.9897 1.51180 0.005989 ... 0.064914 0.120080 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
2 0.000390 0.61289 0.081793 0.000200 0.63668 0.046893 0.77756 2.5789 1.07860 0.002967 ... 0.034090 0.067948 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
3 0.000531 0.63249 0.101910 0.000095 0.73328 0.032303 0.82981 2.7731 1.10470 0.001249 ... 0.048499 0.086804 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
4 0.000120 0.57602 0.034848 0.000027 0.42686 0.010140 0.74274 2.7256 1.18550 0.007026 ... 0.040301 0.083489 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.34932 1.4168 0.63966 0.001363 ... 0.000000 0.000000 0.024149 0.005478 0.011140 0.020897 0.020047 0.033063 O olivine
108 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.36231 1.5187 0.78218 0.002611 ... 0.000000 0.000000 0.006454 0.019646 0.029715 0.082459 0.002062 0.011103 O olivine
109 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.41487 1.3286 0.52607 0.000922 ... 0.000000 0.000000 0.001635 0.012877 0.018680 0.081466 0.000908 0.007234 O olivine
110 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.40870 1.3452 0.49501 0.000348 ... 0.000000 0.000000 0.003997 0.010358 0.016184 0.066220 0.001656 0.009500 O olivine
111 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.38783 1.4806 0.70626 0.001703 ... 0.000000 0.000000 0.004800 0.017835 0.026778 0.091580 0.001917 0.010966 O olivine

1120 rows × 21 columns

In [10]:
# spinel構造のDataFrame作成
nsys = nappy.io.read('pmdini_spinel')
species = nsys.get_symbols()
df_spi = None
for gsf in gsfs_spi:
    print(gsf)
    di = read_desc(gsf)
    df1 = pd.DataFrame(di, columns=[f'{i+1:d}' for i in range(ndim)])
    df2 = pd.DataFrame({'species':species, 
                        'struct':['spinel' for i,s in enumerate(species)]})
    df = pd.concat([df1,df2], axis=1)
    if df_spi is None:
        df_spi = df
    else:
        df_spi = pd.concat([df_spi, df])

df_spi
mkdesc_spinel/out.desc.gsf_4000
mkdesc_spinel/out.desc.gsf_1000
mkdesc_spinel/out.desc.gsf_5000
mkdesc_spinel/out.desc.gsf_8000
mkdesc_spinel/out.desc.gsf_9000
mkdesc_spinel/out.desc.gsf_2000
mkdesc_spinel/out.desc.gsf_10000
mkdesc_spinel/out.desc.gsf_6000
mkdesc_spinel/out.desc.gsf_7000
mkdesc_spinel/out.desc.gsf_3000
Out[10]:
1 2 3 4 5 6 7 8 9 10 ... 12 13 14 15 16 17 18 19 species struct
0 0.000024 0.41971 0.009505 0.000580 1.0880 0.13376 0.82408 2.8157 1.21900 0.002809 ... 0.045968 0.090854 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg spinel
1 0.000018 0.43576 0.007852 0.000592 1.0729 0.13182 0.81081 2.8603 1.27690 0.002923 ... 0.047791 0.097070 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg spinel
2 0.000014 0.38711 0.006169 0.000537 1.1061 0.12992 0.80708 2.8047 1.19560 0.001932 ... 0.047913 0.093557 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg spinel
3 0.000016 0.41068 0.006867 0.000738 1.1448 0.16176 0.85449 2.8887 1.31860 0.003395 ... 0.049827 0.100510 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg spinel
4 0.000022 0.47263 0.009383 0.000506 1.0985 0.12513 0.80757 2.8497 1.32410 0.004889 ... 0.048869 0.100420 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg spinel
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107 0.000000 0.00000 0.000000 0.000000 0.0000 0.00000 0.44368 1.2912 0.44298 0.000349 ... 0.000000 0.000000 0.000268 0.014644 0.019666 0.081814 0.000259 0.005240 O spinel
108 0.000000 0.00000 0.000000 0.000000 0.0000 0.00000 0.38360 1.4027 0.63210 0.001478 ... 0.000000 0.000000 0.001865 0.017336 0.024916 0.079120 0.001912 0.009930 O spinel
109 0.000000 0.00000 0.000000 0.000000 0.0000 0.00000 0.44118 1.4429 0.61205 0.000811 ... 0.000000 0.000000 0.001185 0.019475 0.027238 0.085882 0.000710 0.007248 O spinel
110 0.000000 0.00000 0.000000 0.000000 0.0000 0.00000 0.38545 1.5511 0.83206 0.002816 ... 0.000000 0.000000 0.004023 0.023825 0.035074 0.098466 0.002795 0.012908 O spinel
111 0.000000 0.00000 0.000000 0.000000 0.0000 0.00000 0.40504 1.5743 0.86869 0.003303 ... 0.000000 0.000000 0.007300 0.020957 0.032637 0.093801 0.002423 0.013150 O spinel

1120 rows × 21 columns

In [11]:
# 全て結合しての1つのDF作成
dfs = pd.concat([df_oli, df_spi], axis=0)
dfs
Out[11]:
1 2 3 4 5 6 7 8 9 10 ... 12 13 14 15 16 17 18 19 species struct
0 0.000539 0.62871 0.101800 0.000263 0.72694 0.063504 0.80328 2.7978 1.18750 0.002173 ... 0.049897 0.092742 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
1 0.000715 0.63984 0.118260 0.000335 0.82090 0.076565 0.78271 2.9897 1.51180 0.005989 ... 0.064914 0.120080 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
2 0.000390 0.61289 0.081793 0.000200 0.63668 0.046893 0.77756 2.5789 1.07860 0.002967 ... 0.034090 0.067948 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
3 0.000531 0.63249 0.101910 0.000095 0.73328 0.032303 0.82981 2.7731 1.10470 0.001249 ... 0.048499 0.086804 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
4 0.000120 0.57602 0.034848 0.000027 0.42686 0.010140 0.74274 2.7256 1.18550 0.007026 ... 0.040301 0.083489 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 Mg olivine
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
107 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.44368 1.2912 0.44298 0.000349 ... 0.000000 0.000000 0.000268 0.014644 0.019666 0.081814 0.000259 0.005240 O spinel
108 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.38360 1.4027 0.63210 0.001478 ... 0.000000 0.000000 0.001865 0.017336 0.024916 0.079120 0.001912 0.009930 O spinel
109 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.44118 1.4429 0.61205 0.000811 ... 0.000000 0.000000 0.001185 0.019475 0.027238 0.085882 0.000710 0.007248 O spinel
110 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.38545 1.5511 0.83206 0.002816 ... 0.000000 0.000000 0.004023 0.023825 0.035074 0.098466 0.002795 0.012908 O spinel
111 0.000000 0.00000 0.000000 0.000000 0.00000 0.000000 0.40504 1.5743 0.86869 0.003303 ... 0.000000 0.000000 0.007300 0.020957 0.032637 0.093801 0.002423 0.013150 O spinel

2240 rows × 21 columns

In [12]:
# PCAを行う
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
In [13]:
data_cols = [ f'{i+1:d}' for i in range(ndim) ]
cdict = {'olivine':'red', 'spinel':'blue'}
In [14]:
proj = pca.fit_transform(dfs[dfs.species=='Mg'][data_cols])
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(proj[:,0], proj[:,1], 
           c=[ cdict[s] for s in dfs[dfs.species=='Mg'].struct],
           alpha=0.5)
ax.set_title('PCA of Mg in Mg2SiO4')
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()
No description has been provided for this image
In [15]:
pca.fit(dfs[dfs.species=='Mg'][data_cols])
proj_oli = pca.transform(dfs.query('species == "Mg" & struct == "olivine"')[data_cols])
proj_spi = pca.transform(dfs.query('species == "Mg" & struct == "spinel"')[data_cols])
#proj_spi = pca.transform(dfs[dfs.species=='Mg' & dfs.struct=='spinel'])
fig, ax = plt.subplots(figsize=(5,5))
sns.kdeplot(proj_oli[:,0], color='red', label='olivine', fill=True)
sns.kdeplot(proj_spi[:,0], color='blue', label='spinel', fill=True)
ax.set_xlabel('PC-1')
ax.set_ylabel('Density')
plt.legend(loc='best', frameon=False)
plt.show()
No description has been provided for this image
In [16]:
proj = pca.fit_transform(dfs[dfs.species=='Si'][data_cols])
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(proj[:,0], proj[:,1], 
           c=[ cdict[s] for s in dfs[dfs.species=='Si'].struct],
           alpha=0.5)
ax.set_title('PCA of Si in Mg2SiO4')
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()
No description has been provided for this image
In [17]:
proj = pca.fit_transform(dfs[dfs.species=='O'][data_cols])
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(proj[:,0], proj[:,1], 
           c=[ cdict[s] for s in dfs[dfs.species=='O'].struct],
           alpha=0.5)
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()
No description has been provided for this image
In [ ]: