Source code for pynao.m_rf0_den

from __future__ import print_function, division
import sys
import numpy as np
from numpy import stack, dot, zeros, einsum, array
from timeit import default_timer as timer
import scipy.linalg.blas as blas

from ctypes import c_double, c_int64, c_int, POINTER

import time
ts = time.time()
te = time.time()

try:
  import numba as nb
  use_numba = True
except:
  use_numba = False

if use_numba:

[docs] @nb.jit(nopython=True, parallel=False) def calc_part_rf0_numba(xvx, zxvx_re, zxvx_im): rf0_re = np.zeros((zxvx_re.shape[0], zxvx_re.shape[0]), dtype=np.float64) rf0_im = np.zeros((zxvx_re.shape[0], zxvx_re.shape[0]), dtype=np.float64) for ib in range(zxvx_im.shape[2]): rf0_tmp = zxvx_re[:, :, ib].dot(xvx[:, :, ib].T) rf0_re += rf0_tmp rf0_tmp = zxvx_im[:, :, ib].dot(xvx[:, :, ib].T) rf0_im += rf0_tmp return rf0_re + complex(0.0, 1.0)*rf0_im
[docs] @nb.jit(nopython=True, parallel=False) def add_outer(a, b): c = np.zeros((a.size, b.size)) for i in range(a.size): for j in range(b.size): c[i, j] = a[i] + b[j] return c
[docs] @nb.jit(nopython=True, parallel=False) def rf0_den_numba(rf0, comega, X, ksn2f, ksn2e, pab2v_den, nprod, norbs, bsize, nspin, nfermi, vstart, dtype=np.complex128): """ Full matrix response in the basis of atom-centered product functions for parallel spins. Blas version to speed up matrix matrix multiplication spped up of 7.237 compared to einsum version for C20 system """ for s in range(nspin): nn = list(range(0, nfermi[s], bsize)) + [nfermi[s]] mm = list(range(vstart[s], norbs, bsize)) + [norbs] for nbs, nbf in zip(nn, nn[1:]): vx = np.zeros((pab2v_den.shape[0], pab2v_den.shape[1], nbf-nbs)) #vx_ref = dot(pab2v_den, X[s,nbs:nbf,:].T) for iv in range(pab2v_den.shape[0]): vx[iv, :, :] = np.dot(pab2v_den[iv, :, :], X[s,nbs:nbf,:].T) for mbs, mbf in zip(mm,mm[1:]): xvx = calc_XVX_numba(X[s,mbs:mbf,:], vx) fmn = add_outer(-ksn2f[0,s,mbs:mbf], ksn2f[0,s,nbs:nbf]) emn = add_outer( ksn2e[0,s,mbs:mbf], -ksn2e[0,s,nbs:nbf]) #zxvx = np.zeros((nprod, mbf-mbs, nbf-nbs), dtype=rf0.dtype) zxvx = (xvx * fmn)* (1.0/ (comega - emn) - 1.0 / (comega + emn)) rf0_nb = calc_part_rf0_numba(xvx, zxvx.real, zxvx.imag) rf0 += rf0_nb return rf0
[docs] @nb.jit(nopython=True, parallel=False) def calc_XVX_numba(X, VX, dtype=np.float64): XVX = np.zeros((VX.shape[1], X.shape[0], VX.shape[2]), dtype=dtype) for i in range(XVX.shape[2]): XVX[:, :, i] = VX[:, :, i].T.dot(X.T) return XVX
#@nb.jit(nopython=True, parallel=True) # use numba here leads to the following issue in test_0087_o2_gw.py # m_rf0_den.py:122: NumbaPerformanceWarning: np.dot() is faster # on contiguous arrays, called on (array(float64, 2d, A), array(float64, 2d, C)) # np.dot(k_c.imag, kernel_sq)
[docs] def si_correlation_numba(si0, ww, X, kernel_sq, ksn2f, ksn2e, pab2v_den, nprod, norbs, bsize, nspin, nfermi, vstart): """ This computes the correlation part of the screened interaction W_c by solving <self.nprod> linear equations (1-K chi0) W = K chi0 K or v_{ind}\sim W_{c} = (1-v\chi_{0})^{-1}v\chi_{0}v scr_inter[w,p,q], where w in ww, p and q in 0..self.nprod """ for iw in range(ww.size): #for iw in range(ww.size): rf0_den_numba(si0[iw, :, :], ww[iw], X, ksn2f, ksn2e, pab2v_den, nprod, norbs, bsize, nspin, nfermi, vstart) # divide ww into complex(w) which is along imaginary # axis (real=0) and grid index(iw) # kernel_sq or hkernel_den is bare coloumb or hartree, rf0 # is \chi_{0}, so here k_c=v*chi_{0} k_c = np.dot(kernel_sq, si0[iw, :, :].real) + \ complex(0.0, 1.0)*np.dot(kernel_sq, si0[iw, :, :].imag) # here v\chi_{0}v or k_c*v b = np.dot(k_c.real, kernel_sq) + complex(0.0, 1.0)*\ np.dot(k_c.imag, kernel_sq) # here (1-v\chi_{0}) or 1-k_c. 1=eye(nprod) k_c = np.eye(nprod) - k_c # k_c * W = v\chi_{0}v = b --> W = np.linalg.solve(K_c,b) si0[iw,:,:] = np.linalg.solve(k_c, b) return si0
[docs]def si_correlation(rf0, si0, ww, kernel_sq, nprod): """ This computes the correlation part of the screened interaction W_c by solving <self.nprod> linear equations (1-K chi0) W = K chi0 K or v_{ind}\sim W_{c} = (1-v\chi_{0})^{-1}v\chi_{0}v scr_inter[w,p,q], where w in ww, p and q in 0..self.nprod """ for iw, w in enumerate(ww): # divide ww into complex(w) which is along imaginary # axis (real=0) and grid index(iw) # kernel_sq or hkernel_den is bare coloumb or hartree, rf0 # is \chi_{0}, so here k_c=v*chi_{0} k_c = np.dot(kernel_sq, rf0[iw,:,:]) # here v\chi_{0}v or k_c*v b = np.dot(k_c, kernel_sq) # here (1-v\chi_{0}) or 1-k_c. 1=eye(nprod) k_c = np.eye(nprod) - k_c # k_c * W = v\chi_{0}v = b --> W = np.linalg.solve(K_c,b) si0[iw,:,:] = np.linalg.solve(k_c, b)
[docs]def calc_part_rf0(xvx, zxvx_re, zxvx_im): rf0_re = np.zeros((zxvx_re.shape[0], zxvx_re.shape[1], zxvx_re.shape[1]), dtype=np.float64) rf0_im = np.zeros((zxvx_re.shape[0], zxvx_re.shape[1], zxvx_re.shape[1]), dtype=np.float64) for iw in range(zxvx_re.shape[0]): for ib in range(zxvx_re.shape[3]): rf0_tmp = run_blasDGEMM(1.0, zxvx_re[iw, :, :, ib], xvx[:, :, ib], trans_a = 0, trans_b = 1) rf0_re[iw, :, :] += rf0_tmp rf0_tmp = run_blasDGEMM(1.0, zxvx_im[iw, :, :, ib], xvx[:, :, ib], trans_a = 0, trans_b = 1) rf0_im[iw, :, :] += rf0_tmp return rf0_re + complex(0.0, 1.0)*rf0_im
[docs]def rf0_den(self, ww): """ Full matrix response in the basis of atom-centered product functions for parallel spins. Blas version to speed up matrix matrix multiplication spped up of 7.237 compared to einsum version for C20 system """ rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex) if hasattr(self, 'pab2v_den'): v = self.pab2v_den else: self.pab2v_den = v = einsum('pab->apb', self.pb.get_ac_vertex_array()) zxvx = zeros((len(ww),self.nprod,self.bsize,self.bsize), dtype=self.dtypeComplex) for s in range(self.nspin): nn = list(range(0,self.nfermi[s],self.bsize))+[self.nfermi[s]] mm = list(range(self.vstart[s],self.norbs,self.bsize))+[self.norbs] for nbs,nbf in zip(nn,nn[1:]): # replace this dot with a blas call to dgemv or zgemv vx = dot(v, self.x[s,nbs:nbf,:].T) #vx = blas.dgemv(v, self.x[s,nbs:nbf,:].T) for mbs,mbf in zip(mm,mm[1:]): xvx = calc_XVX(self.x[s,mbs:mbf,:], vx) fmn = np.add.outer(-self.ksn2f[0,s,mbs:mbf], self.ksn2f[0,s,nbs:nbf]) emn = np.add.outer( self.ksn2e[0,s,mbs:mbf],-self.ksn2e[0,s,nbs:nbf]) zxvx.fill(0.0) for iw, comega in enumerate(ww): zxvx[iw,:,0:mbf-mbs,0:nbf-nbs] = (xvx * fmn)* (1.0/ (comega - emn) - 1.0 / (comega + emn)) rf0_nb = calc_part_rf0(xvx, zxvx[:, :, 0:mbf-mbs, 0:nbf-nbs].real, zxvx[:, :, 0:mbf-mbs, 0:nbf-nbs].imag) rf0 += rf0_nb return rf0
[docs]def run_blasZGEMM(alpha, A, B, **kwargs): return blas.zgemm(alpha, A, B, **kwargs)
[docs]def run_blasDGEMM(alpha, A, B, **kwargs): return blas.dgemm(alpha, A, B, **kwargs)
[docs]def rf0_den_einsum(self, ww): """ Full matrix response in the basis of atom-centered product functions for parallel spins einsum version, slow """ rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex) if hasattr(self, 'pab2v_den'): v = self.pab2v_den else: self.pab2v_den = v = einsum('pab->apb', self.pb.get_ac_vertex_array()) zxvx = zeros((len(ww),self.nprod,self.bsize,self.bsize), dtype=self.dtypeComplex) for s in range(self.nspin): nn = list(range(0,self.nfermi[s],self.bsize))+[self.nfermi[s]] mm = list(range(self.vstart[s],self.norbs,self.bsize))+[self.norbs] for nbs,nbf in zip(nn,nn[1:]): vx = dot(v, self.x[s,nbs:nbf,:].T) for mbs,mbf in zip(mm,mm[1:]): xvx = einsum('mb,bpn->pmn', self.x[s,mbs:mbf,:],vx) fmn = np.add.outer(-self.ksn2f[0,s,mbs:mbf], self.ksn2f[0,s,nbs:nbf]) emn = np.add.outer( self.ksn2e[0,s,mbs:mbf],-self.ksn2e[0,s,nbs:nbf]) zxvx.fill(0.0) for iw,comega in enumerate(ww): zxvx[iw,:,0:mbf-mbs,0:nbf-nbs] = (xvx * fmn)* (1.0/ (comega - emn) - 1.0 / (comega + emn)) rf0 += einsum('wpmn,qmn->wpq', zxvx[...,0:mbf-mbs,0:nbf-nbs], xvx) return rf0
[docs]def calc_XVX(X, VX, dtype=np.float64): XVX = np.zeros((VX.shape[1], X.shape[0], VX.shape[2]), dtype=dtype) for i in range(XVX.shape[2]): #XVX[:, :, i] = blas.dgemm(1.0, VX[:, :, i], X, trans_a=1, trans_b=1) XVX[:, :, i] = VX[:, :, i].T.dot(X.T) return XVX
[docs]def rf0_cmplx_ref_blk(self, ww): """ Full matrix response in the basis of atom-centered product functions """ rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex) v = einsum('pab->apb', self.pb.get_ac_vertex_array()) #print('v.shape', v.shape) t1 = timer() if self.verbosity>1: print(__name__, 'self.ksn2e', self.ksn2e, len(ww)) bsize = 40 zxvx = zeros((len(ww), self.nprod,bsize,bsize), dtype=self.dtypeComplex) sn2e = self.ksn2e.reshape((self.nspin*self.norbs)) sn2f = self.ksn2f.reshape((self.nspin*self.norbs)) sn2x = self.x.reshape((self.nspin*self.norbs,self.norbs)) nn = range(0,len(sn2x),bsize)+[len(sn2x)] #print('nn = ', nn, sn2x.shape, len(sn2x)) for nbs,nbf in zip(nn,nn[1:]): vx = dot(v, sn2x[nbs:nbf,:].T) for mbs,mbf in zip(nn,nn[1:]): xvx = einsum('mb,bpn->pmn', sn2x[mbs:mbf,:],vx) fmn = np.add.outer(-sn2f[mbs:mbf], sn2f[nbs:nbf]) emn = np.add.outer( sn2e[mbs:mbf],-sn2e[nbs:nbf]) zxvx.fill(0.0) for iw,comega in enumerate(ww): zxvx[iw,:,0:mbf-mbs,0:nbf-nbs] = (xvx * fmn) / (comega - emn) rf0 += einsum('wpmn,qmn->wpq', zxvx[...,0:mbf-mbs,0:nbf-nbs], xvx) t2 = timer() if self.verbosity>0: print(__name__, 'rf0_ref_blk', t2-t1) return rf0
[docs]def rf0_cmplx_ref(self, ww): """ Full matrix response in the basis of atom-centered product functions """ rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex) v = self.pb.get_ac_vertex_array() t1 = timer() if self.verbosity>1: print(__name__, 'self.ksn2e', self.ksn2e, len(ww)) zvxx_a = zeros((len(ww), self.nprod), dtype=self.dtypeComplex) for s in range(self.nspin): n2e = self.ksn2e[0,s,:] n2f = self.ksn2f[0,s,:] n2x = self.x[s,:,:] for en,fn,xn in zip(n2e,n2f,n2x): vx = dot(v, xn) for em,fm,xm in zip(n2e,n2f,n2x): vxx_a = dot(vx, xm.T) for iw,comega in enumerate(ww): zvxx_a[iw,:] = vxx_a * (fn - fm)/ (comega - (em - en)) rf0 += einsum('wa,b->wab', zvxx_a, vxx_a) t2 = timer() if self.verbosity>0: print(__name__, 'rf0_ref_loop', t2-t1) return rf0
[docs]def rf0_cmplx_vertex_dp(self, ww): """ Full matrix response in the basis of atom-centered product functions """ rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex) v_arr = self.pb.get_dp_vertex_array() zvxx_a = zeros((len(ww), self.nprod), dtype=self.dtypeComplex) for n,(en,fn) in enumerate(zip(self.ksn2e[0,0,0:self.nfermi], self.ksn2f[0, 0, 0:self.nfermi])): vx = dot(v_arr, self.xocc[0][n,:]) for m,(em,fm) in enumerate(zip(self.ksn2e[0,0,self.vstart:],self.ksn2f[0,0,self.vstart:])): if (fn - fm)<0 : break vxx_a = dot(vx, self.xvrt[0][m,:]) * self.cc_da for iw,comega in enumerate(ww): zvxx_a[iw,:] = vxx_a * (fn - fm) * ( 1.0 / (comega - (em - en)) - 1.0 / (comega + (em - en)) ) rf0 = rf0 + einsum('wa,b->wab', zvxx_a, vxx_a) return rf0
[docs]def rf0_cmplx_vertex_ac(self, ww): """ Full matrix response in the basis of atom-centered product functions """ rf0 = np.zeros((len(ww), self.nprod, self.nprod), dtype=self.dtypeComplex) v = self.pb.get_ac_vertex_array() t1 = timer() if self.verbosity>1: print(__name__, 'self.ksn2e', self.ksn2e, len(ww)) #print(self.ksn2e[0,0,0]-self.ksn2e) #print(self.ksn2f) #print(' abs(v).sum(), ww.sum(), self.nfermi, self.vstart ') #print(abs(v).sum(), ww.sum(), self.nfermi, self.vstart) zvxx_a = zeros((len(ww), self.nprod), dtype=self.dtypeComplex) for n,(en,fn) in enumerate(zip(self.ksn2e[0,0,0:self.nfermi[0]], self.ksn2f[0, 0, 0:self.nfermi[0]])): #if self.verbosity>1: print(__name__, 'n =', n) vx = dot(v, self.xocc[0][n,:]) for m,(em,fm) in enumerate(zip(self.ksn2e[0,0,self.vstart[0]:],self.ksn2f[0,0,self.vstart[0]:])): if (fn - fm)<0 : break vxx_a = dot(vx, self.xvrt[0][m,:].T) for iw,comega in enumerate(ww): zvxx_a[iw,:] = vxx_a * (fn - fm) * ( 1.0 / (comega - (em - en)) - 1.0 / (comega + (em - en)) ) rf0 += einsum('wa,b->wab', zvxx_a, vxx_a) t2 = timer() if self.verbosity>1: print(__name__, 'finished rf0', t2-t1) return rf0