from __future__ import print_function, division
import sys
import numpy as np
from numpy import stack, dot, zeros, einsum, array
from timeit import default_timer as timer
import scipy.linalg.blas as blas
from ctypes import c_double, c_int64, c_int, POINTER
import time
ts = time.time()
te = time.time()
try:
import numba as nb
use_numba = True
except:
use_numba = False
if use_numba:
[docs] @nb.jit(nopython=True, parallel=False)
def calc_part_rf0_numba(xvx, zxvx_re, zxvx_im):
rf0_re = np.zeros((zxvx_re.shape[0], zxvx_re.shape[0]), dtype=np.float64)
rf0_im = np.zeros((zxvx_re.shape[0], zxvx_re.shape[0]), dtype=np.float64)
for ib in range(zxvx_im.shape[2]):
rf0_tmp = zxvx_re[:, :, ib].dot(xvx[:, :, ib].T)
rf0_re += rf0_tmp
rf0_tmp = zxvx_im[:, :, ib].dot(xvx[:, :, ib].T)
rf0_im += rf0_tmp
return rf0_re + complex(0.0, 1.0)*rf0_im
[docs] @nb.jit(nopython=True, parallel=False)
def add_outer(a, b):
c = np.zeros((a.size, b.size))
for i in range(a.size):
for j in range(b.size):
c[i, j] = a[i] + b[j]
return c
[docs] @nb.jit(nopython=True, parallel=False)
def rf0_den_numba(rf0, comega, X, ksn2f, ksn2e, pab2v_den, nprod, norbs, bsize, nspin,
nfermi, vstart, dtype=np.complex128):
"""
Full matrix response in the basis of atom-centered product functions for
parallel spins.
Blas version to speed up matrix matrix multiplication
spped up of 7.237 compared to einsum version for C20 system
"""
for s in range(nspin):
nn = list(range(0, nfermi[s], bsize)) + [nfermi[s]]
mm = list(range(vstart[s], norbs, bsize)) + [norbs]
for nbs, nbf in zip(nn, nn[1:]):
vx = np.zeros((pab2v_den.shape[0], pab2v_den.shape[1], nbf-nbs))
#vx_ref = dot(pab2v_den, X[s,nbs:nbf,:].T)
for iv in range(pab2v_den.shape[0]):
vx[iv, :, :] = np.dot(pab2v_den[iv, :, :], X[s,nbs:nbf,:].T)
for mbs, mbf in zip(mm,mm[1:]):
xvx = calc_XVX_numba(X[s,mbs:mbf,:], vx)
fmn = add_outer(-ksn2f[0,s,mbs:mbf], ksn2f[0,s,nbs:nbf])
emn = add_outer( ksn2e[0,s,mbs:mbf], -ksn2e[0,s,nbs:nbf])
#zxvx = np.zeros((nprod, mbf-mbs, nbf-nbs), dtype=rf0.dtype)
zxvx = (xvx * fmn)* (1.0/ (comega - emn) - 1.0 / (comega + emn))
rf0_nb = calc_part_rf0_numba(xvx, zxvx.real, zxvx.imag)
rf0 += rf0_nb
return rf0
[docs] @nb.jit(nopython=True, parallel=False)
def calc_XVX_numba(X, VX, dtype=np.float64):
XVX = np.zeros((VX.shape[1], X.shape[0], VX.shape[2]), dtype=dtype)
for i in range(XVX.shape[2]):
XVX[:, :, i] = VX[:, :, i].T.dot(X.T)
return XVX
#@nb.jit(nopython=True, parallel=True)
# use numba here leads to the following issue in test_0087_o2_gw.py
# m_rf0_den.py:122: NumbaPerformanceWarning: np.dot() is faster
# on contiguous arrays, called on (array(float64, 2d, A), array(float64, 2d, C))
# np.dot(k_c.imag, kernel_sq)
[docs] def si_correlation_numba(si0, ww, X, kernel_sq, ksn2f, ksn2e, pab2v_den, nprod,
norbs, bsize, nspin, nfermi, vstart):
"""
This computes the correlation part of the screened interaction W_c
by solving <self.nprod> linear equations (1-K chi0) W = K chi0 K
or v_{ind}\sim W_{c} = (1-v\chi_{0})^{-1}v\chi_{0}v
scr_inter[w,p,q], where w in ww, p and q in 0..self.nprod
"""
for iw in range(ww.size):
#for iw in range(ww.size):
rf0_den_numba(si0[iw, :, :], ww[iw], X, ksn2f, ksn2e, pab2v_den,
nprod, norbs, bsize, nspin,
nfermi, vstart)
# divide ww into complex(w) which is along imaginary
# axis (real=0) and grid index(iw)
# kernel_sq or hkernel_den is bare coloumb or hartree, rf0
# is \chi_{0}, so here k_c=v*chi_{0}
k_c = np.dot(kernel_sq, si0[iw, :, :].real) + \
complex(0.0, 1.0)*np.dot(kernel_sq, si0[iw, :, :].imag)
# here v\chi_{0}v or k_c*v
b = np.dot(k_c.real, kernel_sq) + complex(0.0, 1.0)*\
np.dot(k_c.imag, kernel_sq)
# here (1-v\chi_{0}) or 1-k_c. 1=eye(nprod)
k_c = np.eye(nprod) - k_c
# k_c * W = v\chi_{0}v = b --> W = np.linalg.solve(K_c,b)
si0[iw,:,:] = np.linalg.solve(k_c, b)
return si0
[docs]def si_correlation(rf0, si0, ww, kernel_sq, nprod):
"""
This computes the correlation part of the screened interaction W_c
by solving <self.nprod> linear equations (1-K chi0) W = K chi0 K
or v_{ind}\sim W_{c} = (1-v\chi_{0})^{-1}v\chi_{0}v
scr_inter[w,p,q], where w in ww, p and q in 0..self.nprod
"""
for iw, w in enumerate(ww):
# divide ww into complex(w) which is along imaginary
# axis (real=0) and grid index(iw)
# kernel_sq or hkernel_den is bare coloumb or hartree, rf0
# is \chi_{0}, so here k_c=v*chi_{0}
k_c = np.dot(kernel_sq, rf0[iw,:,:])
# here v\chi_{0}v or k_c*v
b = np.dot(k_c, kernel_sq)
# here (1-v\chi_{0}) or 1-k_c. 1=eye(nprod)
k_c = np.eye(nprod) - k_c
# k_c * W = v\chi_{0}v = b --> W = np.linalg.solve(K_c,b)
si0[iw,:,:] = np.linalg.solve(k_c, b)
[docs]def calc_part_rf0(xvx, zxvx_re, zxvx_im):
rf0_re = np.zeros((zxvx_re.shape[0], zxvx_re.shape[1], zxvx_re.shape[1]), dtype=np.float64)
rf0_im = np.zeros((zxvx_re.shape[0], zxvx_re.shape[1], zxvx_re.shape[1]), dtype=np.float64)
for iw in range(zxvx_re.shape[0]):
for ib in range(zxvx_re.shape[3]):
rf0_tmp = run_blasDGEMM(1.0, zxvx_re[iw, :, :, ib],
xvx[:, :, ib], trans_a = 0, trans_b = 1)
rf0_re[iw, :, :] += rf0_tmp
rf0_tmp = run_blasDGEMM(1.0, zxvx_im[iw, :, :, ib],
xvx[:, :, ib], trans_a = 0, trans_b = 1)
rf0_im[iw, :, :] += rf0_tmp
return rf0_re + complex(0.0, 1.0)*rf0_im
[docs]def rf0_den(self, ww):
"""
Full matrix response in the basis of atom-centered product functions for
parallel spins.
Blas version to speed up matrix matrix multiplication
spped up of 7.237 compared to einsum version for C20 system
"""
rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex)
if hasattr(self, 'pab2v_den'):
v = self.pab2v_den
else:
self.pab2v_den = v = einsum('pab->apb', self.pb.get_ac_vertex_array())
zxvx = zeros((len(ww),self.nprod,self.bsize,self.bsize), dtype=self.dtypeComplex)
for s in range(self.nspin):
nn = list(range(0,self.nfermi[s],self.bsize))+[self.nfermi[s]]
mm = list(range(self.vstart[s],self.norbs,self.bsize))+[self.norbs]
for nbs,nbf in zip(nn,nn[1:]):
# replace this dot with a blas call to dgemv or zgemv
vx = dot(v, self.x[s,nbs:nbf,:].T)
#vx = blas.dgemv(v, self.x[s,nbs:nbf,:].T)
for mbs,mbf in zip(mm,mm[1:]):
xvx = calc_XVX(self.x[s,mbs:mbf,:], vx)
fmn = np.add.outer(-self.ksn2f[0,s,mbs:mbf], self.ksn2f[0,s,nbs:nbf])
emn = np.add.outer( self.ksn2e[0,s,mbs:mbf],-self.ksn2e[0,s,nbs:nbf])
zxvx.fill(0.0)
for iw, comega in enumerate(ww):
zxvx[iw,:,0:mbf-mbs,0:nbf-nbs] = (xvx * fmn)* (1.0/ (comega - emn) - 1.0 / (comega + emn))
rf0_nb = calc_part_rf0(xvx, zxvx[:, :, 0:mbf-mbs, 0:nbf-nbs].real,
zxvx[:, :, 0:mbf-mbs, 0:nbf-nbs].imag)
rf0 += rf0_nb
return rf0
[docs]def run_blasZGEMM(alpha, A, B, **kwargs):
return blas.zgemm(alpha, A, B, **kwargs)
[docs]def run_blasDGEMM(alpha, A, B, **kwargs):
return blas.dgemm(alpha, A, B, **kwargs)
[docs]def rf0_den_einsum(self, ww):
"""
Full matrix response in the basis of atom-centered product functions for
parallel spins
einsum version, slow
"""
rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex)
if hasattr(self, 'pab2v_den'):
v = self.pab2v_den
else:
self.pab2v_den = v = einsum('pab->apb', self.pb.get_ac_vertex_array())
zxvx = zeros((len(ww),self.nprod,self.bsize,self.bsize), dtype=self.dtypeComplex)
for s in range(self.nspin):
nn = list(range(0,self.nfermi[s],self.bsize))+[self.nfermi[s]]
mm = list(range(self.vstart[s],self.norbs,self.bsize))+[self.norbs]
for nbs,nbf in zip(nn,nn[1:]):
vx = dot(v, self.x[s,nbs:nbf,:].T)
for mbs,mbf in zip(mm,mm[1:]):
xvx = einsum('mb,bpn->pmn', self.x[s,mbs:mbf,:],vx)
fmn = np.add.outer(-self.ksn2f[0,s,mbs:mbf], self.ksn2f[0,s,nbs:nbf])
emn = np.add.outer( self.ksn2e[0,s,mbs:mbf],-self.ksn2e[0,s,nbs:nbf])
zxvx.fill(0.0)
for iw,comega in enumerate(ww):
zxvx[iw,:,0:mbf-mbs,0:nbf-nbs] = (xvx * fmn)* (1.0/ (comega - emn) - 1.0 / (comega + emn))
rf0 += einsum('wpmn,qmn->wpq', zxvx[...,0:mbf-mbs,0:nbf-nbs], xvx)
return rf0
[docs]def calc_XVX(X, VX, dtype=np.float64):
XVX = np.zeros((VX.shape[1], X.shape[0], VX.shape[2]), dtype=dtype)
for i in range(XVX.shape[2]):
#XVX[:, :, i] = blas.dgemm(1.0, VX[:, :, i], X, trans_a=1, trans_b=1)
XVX[:, :, i] = VX[:, :, i].T.dot(X.T)
return XVX
[docs]def rf0_cmplx_ref_blk(self, ww):
""" Full matrix response in the basis of atom-centered product functions """
rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex)
v = einsum('pab->apb', self.pb.get_ac_vertex_array())
#print('v.shape', v.shape)
t1 = timer()
if self.verbosity>1: print(__name__, 'self.ksn2e', self.ksn2e, len(ww))
bsize = 40
zxvx = zeros((len(ww), self.nprod,bsize,bsize), dtype=self.dtypeComplex)
sn2e = self.ksn2e.reshape((self.nspin*self.norbs))
sn2f = self.ksn2f.reshape((self.nspin*self.norbs))
sn2x = self.x.reshape((self.nspin*self.norbs,self.norbs))
nn = range(0,len(sn2x),bsize)+[len(sn2x)]
#print('nn = ', nn, sn2x.shape, len(sn2x))
for nbs,nbf in zip(nn,nn[1:]):
vx = dot(v, sn2x[nbs:nbf,:].T)
for mbs,mbf in zip(nn,nn[1:]):
xvx = einsum('mb,bpn->pmn', sn2x[mbs:mbf,:],vx)
fmn = np.add.outer(-sn2f[mbs:mbf], sn2f[nbs:nbf])
emn = np.add.outer( sn2e[mbs:mbf],-sn2e[nbs:nbf])
zxvx.fill(0.0)
for iw,comega in enumerate(ww):
zxvx[iw,:,0:mbf-mbs,0:nbf-nbs] = (xvx * fmn) / (comega - emn)
rf0 += einsum('wpmn,qmn->wpq', zxvx[...,0:mbf-mbs,0:nbf-nbs], xvx)
t2 = timer()
if self.verbosity>0: print(__name__, 'rf0_ref_blk', t2-t1)
return rf0
[docs]def rf0_cmplx_ref(self, ww):
""" Full matrix response in the basis of atom-centered product functions """
rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex)
v = self.pb.get_ac_vertex_array()
t1 = timer()
if self.verbosity>1: print(__name__, 'self.ksn2e', self.ksn2e, len(ww))
zvxx_a = zeros((len(ww), self.nprod), dtype=self.dtypeComplex)
for s in range(self.nspin):
n2e = self.ksn2e[0,s,:]
n2f = self.ksn2f[0,s,:]
n2x = self.x[s,:,:]
for en,fn,xn in zip(n2e,n2f,n2x):
vx = dot(v, xn)
for em,fm,xm in zip(n2e,n2f,n2x):
vxx_a = dot(vx, xm.T)
for iw,comega in enumerate(ww):
zvxx_a[iw,:] = vxx_a * (fn - fm)/ (comega - (em - en))
rf0 += einsum('wa,b->wab', zvxx_a, vxx_a)
t2 = timer()
if self.verbosity>0: print(__name__, 'rf0_ref_loop', t2-t1)
return rf0
[docs]def rf0_cmplx_vertex_dp(self, ww):
""" Full matrix response in the basis of atom-centered product functions """
rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex)
v_arr = self.pb.get_dp_vertex_array()
zvxx_a = zeros((len(ww), self.nprod), dtype=self.dtypeComplex)
for n,(en,fn) in enumerate(zip(self.ksn2e[0,0,0:self.nfermi], self.ksn2f[0, 0, 0:self.nfermi])):
vx = dot(v_arr, self.xocc[0][n,:])
for m,(em,fm) in enumerate(zip(self.ksn2e[0,0,self.vstart:],self.ksn2f[0,0,self.vstart:])):
if (fn - fm)<0 : break
vxx_a = dot(vx, self.xvrt[0][m,:]) * self.cc_da
for iw,comega in enumerate(ww):
zvxx_a[iw,:] = vxx_a * (fn - fm) * ( 1.0 / (comega - (em - en)) - 1.0 / (comega + (em - en)) )
rf0 = rf0 + einsum('wa,b->wab', zvxx_a, vxx_a)
return rf0
[docs]def rf0_cmplx_vertex_ac(self, ww):
""" Full matrix response in the basis of atom-centered product functions """
rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex)
v = self.pb.get_ac_vertex_array()
t1 = timer()
if self.verbosity>1: print(__name__, 'self.ksn2e', self.ksn2e, len(ww))
#print(self.ksn2e[0,0,0]-self.ksn2e)
#print(self.ksn2f)
#print(' abs(v).sum(), ww.sum(), self.nfermi, self.vstart ')
#print(abs(v).sum(), ww.sum(), self.nfermi, self.vstart)
zvxx_a = zeros((len(ww), self.nprod), dtype=self.dtypeComplex)
for n,(en,fn) in enumerate(zip(self.ksn2e[0,0,0:self.nfermi[0]], self.ksn2f[0, 0, 0:self.nfermi[0]])):
#if self.verbosity>1: print(__name__, 'n =', n)
vx = dot(v, self.xocc[0][n,:])
for m,(em,fm) in enumerate(zip(self.ksn2e[0,0,self.vstart[0]:],self.ksn2f[0,0,self.vstart[0]:])):
if (fn - fm)<0 : break
vxx_a = dot(vx, self.xvrt[0][m,:].T)
for iw,comega in enumerate(ww):
zvxx_a[iw,:] = vxx_a * (fn - fm) * ( 1.0 / (comega - (em - en)) - 1.0 / (comega + (em - en)) )
rf0 += einsum('wa,b->wab', zvxx_a, vxx_a)
t2 = timer()
if self.verbosity>1: print(__name__, 'finished rf0', t2-t1)
return rf0