Efficient way of Vectorization

Question

1 voto

Hello, I searched everywhere for the efficient explaination of vectorization, I would like to know how can we use the technique of vectorization efficiently if we have this kind of problem

clc
clear
close all
n=1000;
C1=zeros(n,n);
C2=zeros(n,n);
A=rand(n,n);
B=rand(n,n);
tic
for i=2:n-1
    for j=2:n-1
        C1(i,j) = (A(i,j)*B(i,j-1) + A(i-1,j)*B(i+1,j-1))/(A(i,j+1)*B(i+1,j));
    end
end
toc
Elapsed time is 0.049266 seconds.
%VECTORIZATION
tic
C2(2:n-1,2:n-1)=(A(2:n-1,2:n-1).*B(2:n-1,1:n-2) + A(1:n-2,2:n-1).*B(3:n,1:n-2))./(A(2:n-1,3:n).*B(3:n,2:n-1));
toc;
Elapsed time is 0.014871 seconds.
norm(C1-C2)
ans = 0

This is a very basic example, although it is showing the improvement after vectorization but not that enough. If I make more divison and multiplication in the same function, "vectorization" will become even worse than "for loop". If anybody have any suggestion regarding this, it would be very helpful for me.

8 comentarios
Mostrar 6 comentarios más antiguos Ocultar 6 comentarios más antiguos

Nadeem Ahmed el 30 de Nov. de 2022

Abrir en MATLAB Online

function [uxy,uxt,uyt,vxy,vxt,vyt,Pxt,Pyt] = MAIN(v1,v2,v3,v4,v5,v6,v7,v8,l11,l22,l33,l44,d11,d22,d33,d44,u11,u22,u33,u44,F11,F13,F15,F16,F17,F21,F23,F25,F26,F27,F33p,F34p,F37p,F53p,F54p,F57p,F71p,F72p,F74p,F75p,F77p,uxtold,uytold,vxtold,vytold,Pxtold,Pytold,tao,a,b,m,n,P_inv)
%VECTOR AND MATRIX INITILISATION
r1=zeros(n-1,m);
r2=zeros(n,m-1);
r3=zeros(n-1,m);
r4=zeros(n,m-1);
f1=zeros(n+1,m+1);
uxy=zeros(n+1,m+1);
vxy=zeros(n+1,m+1);
X1=zeros(n-1,m);
X2=zeros(n,m-1);
X3=zeros(n-1,m);
X4=zeros(n,m-1);
%INITIAL VALUE CONDITIONS
uxt=uxtold;
uyt=uytold;
vxt=vxtold;
vyt=vytold;
Pxt=Pxtold;
Pyt=Pytold;
rho=1;
%BOUNDARY CONDITIONS
uxt(2:n+1,1) = 0;
uxt(1:n+1,n+1) = 0;
vxt(2:n+1,1) = 0;
vxt(2:n+1,n+1) = 0;
uyt(1,2:n+1) = 0;
uyt(n+1,2:n+1) = 0;
vyt(1,2:n+1) = 0;
vyt(n+1,2:n+1) = 0;
N=n-1;
M=n-1;
tol = 1e-6;
uxtp=15*ones(n+1,m+1);
uytp=15*ones(n+1,m+1);
vxtp=15*ones(n+1,m+1);
vytp=15*ones(n+1,m+1);
iter1 = 0;
%while(norm(uxtp-uxt)>tol && norm(uytp-uyt)>tol && norm(vxtp-vxt)>tol && norm(vytp-vyt)>tol)
while(norm(uxtp-uxt)>tol || norm(uytp-uyt)>tol || norm(vxtp-vxt)>tol || norm(vytp-vyt)>tol)
    iter1 = iter1 + 1;
    uxtp=uxt;
    uytp=uyt;
    vxtp=vxt;
    vytp=vyt;
    l1=l11;
    d1=d11;
    u1=u11;
    l3=l33;
    d3=d33;
    u3=u33;
    for j=2:m+1
        for i=2:n
            r1(i-1,j-1)=-((F53p(i,j)*F71p(i,j)*uxt(i,j))/(F57p(i,j)*F77p(i,j)) ...
                (F53p(i,j)*F72p(i,j)*uxt(i,j-1))/(F57p(i,j)*F77p(i,j)) +...
                (F54p(i,j)*F71p(i+1,j)*uxt(i+1,j))/(F57p(i,j)*F77p(i+1,j)) +...
                (F54p(i,j)*F72p(i+1,j)*uxt(i+1,j-1))/(F57p(i,j)*F77p(i+1,j)) -...
                ((F53p(i,j)*v1(i,j))/(F57p(i,j))) - (F54p(i,j)*v1(i+1,j))/(F57p(i,j)) - v3(i,j));
            r3(i-1,j-1)=-((F53p(i,j)*F71p(i,j)*vxt(i,j))/(F57p(i,j)*F77p(i,j)) +...
                (F53p(i,j)*F72p(i,j)*vxt(i,j-1))/(F57p(i,j)*F77p(i,j)) +...
                (F54p(i,j)*F71p(i+1,j)*vxt(i+1,j))/(F57p(i,j)*F77p(i+1,j)) +...
                (F54p(i,j)*F72p(i+1,j)*vxt(i+1,j-1))/(F57p(i,j)*F77p(i+1,j)) -...
                ((F53p(i,j)*v4(i,j))/(F57p(i,j))) - (F54p(i,j)*v4(i+1,j))/(F57p(i,j)) - v6(i,j));
        end
    end
    %COLUMN-WISE SWEEP
    for i = 2 : N
        d1(i,:) = d1(i,:) - (l1(i,:)./d1(i-1,:)).*u1(i-1,:);
        r1(i,:) = r1(i,:) - (l1(i,:)./d1(i-1,:)).*r1(i-1,:);
        d3(i,:) = d3(i,:) - (l3(i,:)./d3(i-1,:)).*u3(i-1,:);
        r3(i,:) = r3(i,:) - (l3(i,:)./d3(i-1,:)).*r3(i-1,:);
    end
    X1(N,:) = r1(N,:)./d1(N,:);
    X3(N,:) = r3(N,:)./d3(N,:);
    for i = N - 1 : -1 : 1
        X1(i,:) = (r1(i,:) - u1(i,:).*X1(i+1,:))./d1(i,:);
        X3(i,:) = (r3(i,:) - u3(i,:).*X3(i+1,:))./d3(i,:);
    end
    uyt(2:n,2:m+1)=X1;
    vyt(2:n,2:m+1)=X3;
    
    %ROW-WISE SWEEP
    l2=l22;
    d2=d22;
    u2=u22;
    l4=l44;
    d4=d44;
    u4=u44;
    for i=2:n+1
        for j=2:m
            r2(i-1,j-1)=-(+((F33p(i,j)*F74p(i,j)*uyt(i,j))/(F37p(i,j)*F77p(i,j))) +...
                (F34p(i,j)*F74p(i,j+1)*uyt(i,j+1))/(F37p(i,j)*F77p(i,j+1)) +...
                (F33p(i,j)*F75p(i,j)*uyt(i-1,j))/(F37p(i,j)*F77p(i,j)) +...
                (F34p(i,j)*F75p(i,j+1)*uyt(i-1,j+1))/(F37p(i,j)*F77p(i,j+1)) -...
                ((F33p(i,j)*v1(i,j))/(F37p(i,j))) - (F34p(i,j)*v1(i,j+1))/(F37p(i,j)) - v2(i,j));
            r4(i-1,j-1)=-(+((F33p(i,j)*F74p(i,j)*vyt(i,j))/(F37p(i,j)*F77p(i,j))) +...
                (F34p(i,j)*F74p(i,j+1)*vyt(i,j+1))/(F37p(i,j)*F77p(i,j+1)) +...
                (F33p(i,j)*F75p(i,j)*vyt(i-1,j))/(F37p(i,j)*F77p(i,j)) +...
                (F34p(i,j)*F75p(i,j+1)*vyt(i-1,j+1))/(F37p(i,j)*F77p(i,j+1)) -...
                ((F33p(i,j)*v4(i,j))/(F37p(i,j))) - (F34p(i,j)*v4(i,j+1))/(F37p(i,j)) - v5(i,j));
        end
    end
    for j = 2 : M
        d2(:,j) = d2(:,j) - (l2(:,j)./d2(:,j-1)).*u2(:,j-1);
        r2(:,j) = r2(:,j) - (l2(:,j)./d2(:,j-1)).*r2(:,j-1);
        d4(:,j) = d4(:,j) - (l4(:,j)./d4(:,j-1)).*u4(:,j-1);
        r4(:,j) = r4(:,j) - (l4(:,j)./d4(:,j-1)).*r4(:,j-1);
    end
    X2(:,M) = r2(:,M)./d2(:,M);
    X4(:,M) = r4(:,M)./d4(:,M);
    for j = M - 1 : -1 : 1
        X2(:,j) = (r2(:,j) - u2(:,j).*X2(:,j+1))./d2(:,j);
        X4(:,j) = (r4(:,j) - u4(:,j).*X4(:,j+1))./d4(:,j);
    end
    uxt(2:n+1,2:m)=X2;
    vxt(2:n+1,2:m)=X4;    
end
% iter1
for i=2:n+1
    for j=2:m+1
        f1(i,j)=  -((+ rho*(((((uyt(i,j) - uyt(i-1,j))/(2*a))+((vxt(i,j) - vxt(i,j-1))/(2*a))))/(2*tao))))+( + ...   %D term
            (rho*(((uytold(i,j)-uytold(i-1,j))/(2*a))*((uyt(i,j)-uyt(i-1,j))/(2*a)))) + ...               %a^2 term
            (rho*(((vyt(i,j)-vyt(i-1,j))/(2*a))*((uxtold(i,j)-uxtold(i,j-1))/(2*b)))) + (rho*(((vytold(i,j)-vytold(i-1,j))/(2*a))*((uxt(i,j)-uxt(i,j-1))/(2*b)))) +...
            (rho*(((vxtold(i,j)-vxtold(i,j-1))/(2*b))*((vxt(i,j)-vxt(i,j-1))/(2*b)))));                   %b^2 term
    end
end
%RIGHT HAND SIDE OF POISSON EQUATION
W1=1;
W2=1;
PV1=zeros(1,n*(m+1));
PV2=zeros(1,n*(m+1));
for j=1:n+1
    for i=1:m+1
        if(i>1)
            if(j==1)
                PV1(W1) = - (F15(i,j+1)*f1(i,j+1))/(F11(i,j+1) + 2*F13(i,j+1)) + v7(i,j);
                W1=W1+1;
            elseif(j==m+1 && i==n+1)
                PV1(W1) = 0;%- F15(i,j)*f1(i,j) - v7(i,j);
                W1=W1+1;
            elseif(j==m+1 && i<n+1)
                PV1(W1) = - (F15(i,j)*f1(i,j))/(F11(i,j) + 2*F13(i,j)) + v7(i,j);
                W1=W1+1;
            else
                PV1(W1) = - (F15(i,j)*f1(i,j) + F16(i,j)*f1(i,j+1))/F17(i,j) + v7(i,j);
                W1=W1+1;
            end
        end
        if(j>1)
            if(i==1)
                PV2(W2) = - (F25(i+1,j)*f1(i+1,j))/(F21(i+1,j) + 2*F23(i+1,j)) + v8(i,j);
                W2=W2+1;
            elseif(i==n+1)
                PV2(W2) = - (F25(i,j)*f1(i,j))/(F21(i,j) + 2*F23(i,j)) + v8(i,j);
                W2=W2+1;
            else
                PV2(W2) = - (F25(i,j)*f1(i,j) + F26(i,j)*f1(i+1,j))/F27(i,j) + v8(i,j);
                W2=W2+1;
            end
        end
    end
end
%Complete RHS
PV = [PV1 PV2];
%MATRIX-VECTOR MULTIPLICATION
P=mtimes(P_inv,PV');
%Converting vector into variable matricies 
X1=1;
X2=(n*(m+1))+1;
for j=1:n+1
    for i=1:m+1
        if(i>1)
            if(j==1)
                Pxt(i,j) = P(X1);
                X1=X1+1;
            elseif(j==m+1)
                Pxt(i,j) = P(X1);
                X1=X1+1;
            else
                Pxt(i,j) = P(X1);
                X1=X1+1;
            end
        end
        if(j>1)
            if(i==1)
                Pyt(i,j) = P(X2);
                X2=X2+1;
            elseif(i==n+1)
                Pyt(i,j) = P(X2);
                X2=X2+1;
            else
                Pyt(i,j) = P(X2);
                X2=X2+1;
            end
        end
    end
end
%Explicit calculation of U and V velocity
for j=2:m+1
    for i=2:n+1
        uxy(i,j)=(F71p(i,j)*uxt(i,j) + F72p(i,j)*uxt(i,j-1) + F74p(i,j)*uyt(i,j) + F75p(i,j)*uyt(i-1,j))/F77p(i,j)  - v1(i,j);
    end
end
for i=2:n+1
    for j=2:m+1
        vxy(i,j)=(F71p(i,j)*vxt(i,j) + F72p(i,j)*vxt(i,j-1) + F74p(i,j)*vyt(i,j) + F75p(i,j)*vyt(i-1,j))/F77p(i,j)  - v4(i,j);
    end
end
%toc
end

This is the code which I am trying to optimize, please let me know if you have any suggestions of this type of coding.

Nadeem Ahmed el 30 de Nov. de 2022

Abrir en MATLAB Online

clc
clear
close all
n=50;
m=50;
uytold=rand(n+1,m+1);
uxtold=rand(n+1,m+1);
vytold=rand(n+1,m+1);
vxtold=rand(n+1,m+1);
Pxtold=rand(n+1,m+1);
Pytold=rand(n+1,m+1);
v1=rand(n+1,m+1);
v2=rand(n+1,m+1);
v3=rand(n+1,m+1);
v4=rand(n+1,m+1);
v5=rand(n+1,m+1);
v6=rand(n+1,m+1);
v7=rand(n+1,m+1);
v8=rand(n+1,m+1);
l11=rand(n-1,n);
l22=rand(n,n-1);
l33=rand(n-1,n);
l44=rand(n,n-1);
d11=rand(n-1,n);
d22=rand(n,n-1);
d33=rand(n-1,n);
d44=rand(n,n-1);
u11=rand(n-1,n);
u22=rand(n,n-1);
u33=rand(n-1,n);
u44=rand(n,n-1);
F11=rand(n+1,m+1);
F13=rand(n+1,m+1);
F15=rand(n+1,m+1);
F16=rand(n+1,m+1);
F17=rand(n+1,m+1);
F21=rand(n+1,m+1);
F23=rand(n+1,m+1);
F25=rand(n+1,m+1);
F26=rand(n+1,m+1);
F27=rand(n+1,m+1);
F33p=rand(n+1,m+1);
F34p=rand(n+1,m+1);
F37p=rand(n+1,m+1);
F53p=rand(n+1,m+1);
F54p=rand(n+1,m+1);
F57p=rand(n+1,m+1);
F71p=rand(n+1,m+1);
F72p=rand(n+1,m+1);
F74p=rand(n+1,m+1);
F75p=rand(n+1,m+1);
F77p=rand(n+1,m+1);
a=0.5;
b=0.5;
tao=0.01;
P_inv=rand(2*n*(m+1),2*n*(m+1));
tic
[uxy,uxt,uyt,vxy,vxt,vyt,Pxt,Pyt] = MAIN(v1,v2,v3,v4,v5,v6,v7,v8,l11,l22,l33,l44,d11,d22,d33,d44,u11,u22,u33,u44,F11,F13,F15,F16,F17,F21,F23,F25,F26,F27,F33p,F34p,F37p,F53p,F54p,F57p,F71p,F72p,F74p,F75p,F77p,uxtold,uytold,vxtold,vytold,Pxtold,Pytold,tao,a,b,m,n,P_inv);
toc
Elapsed time is 0.070027 seconds.
function [uxy,uxt,uyt,vxy,vxt,vyt,Pxt,Pyt] = MAIN(v1,v2,v3,v4,v5,v6,v7,v8,l11,l22,l33,l44,d11,d22,d33,d44,u11,u22,u33,u44,F11,F13,F15,F16,F17,F21,F23,F25,F26,F27,F33p,F34p,F37p,F53p,F54p,F57p,F71p,F72p,F74p,F75p,F77p,uxtold,uytold,vxtold,vytold,Pxtold,Pytold,tao,a,b,m,n,P_inv)
%VECTOR AND MATRIX INITILISATION
r1=zeros(n-1,m);
r2=zeros(n,m-1);
r3=zeros(n-1,m);
r4=zeros(n,m-1);
f1=zeros(n+1,m+1);
uxy=zeros(n+1,m+1);
vxy=zeros(n+1,m+1);
X1=zeros(n-1,m);
X2=zeros(n,m-1);
X3=zeros(n-1,m);
X4=zeros(n,m-1);
%INITIAL VALUE CONDITIONS
uxt=uxtold;
uyt=uytold;
vxt=vxtold;
vyt=vytold;
Pxt=Pxtold;
Pyt=Pytold;
rho=1;
%BOUNDARY CONDITIONS ARE ZERO FOR PRECONDITIONER
uxt(2:n+1,1) = 0;
uxt(1:n+1,n+1) = 0;
vxt(2:n+1,1) = 0;
vxt(2:n+1,n+1) = 0;
uyt(1,2:n+1) = 0;
uyt(n+1,2:n+1) = 0;
vyt(1,2:n+1) = 0;
vyt(n+1,2:n+1) = 0;
N=n-1;
M=n-1;
for kk=1:10
    
    %COLUMN-WISE SWEEP
    l1=l11;
    d1=d11;
    u1=u11;
    l3=l33;
    d3=d33;
    u3=u33;
    for j=2:m+1
        for i=2:n
            r1(i-1,j-1)=-((F53p(i,j)*F71p(i,j)*uxt(i,j))/(F57p(i,j)*F77p(i,j)) + (F53p(i,j)*F72p(i,j)*uxt(i,j-1))/(F57p(i,j)*F77p(i,j)) + (F54p(i,j)*F71p(i+1,j)*uxt(i+1,j))/(F57p(i,j)*F77p(i+1,j)) + (F54p(i,j)*F72p(i+1,j)*uxt(i+1,j-1))/(F57p(i,j)*F77p(i+1,j)) - ((F53p(i,j)*v1(i,j))/(F57p(i,j))) - (F54p(i,j)*v1(i+1,j))/(F57p(i,j)) - v3(i,j));
            r3(i-1,j-1)=-((F53p(i,j)*F71p(i,j)*vxt(i,j))/(F57p(i,j)*F77p(i,j)) + (F53p(i,j)*F72p(i,j)*vxt(i,j-1))/(F57p(i,j)*F77p(i,j)) + (F54p(i,j)*F71p(i+1,j)*vxt(i+1,j))/(F57p(i,j)*F77p(i+1,j)) + (F54p(i,j)*F72p(i+1,j)*vxt(i+1,j-1))/(F57p(i,j)*F77p(i+1,j)) - ((F53p(i,j)*v4(i,j))/(F57p(i,j))) - (F54p(i,j)*v4(i+1,j))/(F57p(i,j)) - v6(i,j));
        end
    end
    for i = 2 : N
        d1(i,:) = d1(i,:) - (l1(i,:)./d1(i-1,:)).*u1(i-1,:);
        r1(i,:) = r1(i,:) - (l1(i,:)./d1(i-1,:)).*r1(i-1,:);
        d3(i,:) = d3(i,:) - (l3(i,:)./d3(i-1,:)).*u3(i-1,:);
        r3(i,:) = r3(i,:) - (l3(i,:)./d3(i-1,:)).*r3(i-1,:);
    end
    X1(N,:) = r1(N,:)./d1(N,:);
    X3(N,:) = r3(N,:)./d3(N,:);
    for i = N - 1 : -1 : 1
        X1(i,:) = (r1(i,:) - u1(i,:).*X1(i+1,:))./d1(i,:);
        X3(i,:) = (r3(i,:) - u3(i,:).*X3(i+1,:))./d3(i,:);
    end
    uyt(2:n,2:m+1)=X1;
    vyt(2:n,2:m+1)=X3;
    
    %ROW-WISE SWEEP
    l2=l22;
    d2=d22;
    u2=u22;
    l4=l44;
    d4=d44;
    u4=u44;
    for i=2:n+1
        for j=2:m
            r2(i-1,j-1)=-(+((F33p(i,j)*F74p(i,j)*uyt(i,j))/(F37p(i,j)*F77p(i,j))) + (F34p(i,j)*F74p(i,j+1)*uyt(i,j+1))/(F37p(i,j)*F77p(i,j+1)) + (F33p(i,j)*F75p(i,j)*uyt(i-1,j))/(F37p(i,j)*F77p(i,j)) + (F34p(i,j)*F75p(i,j+1)*uyt(i-1,j+1))/(F37p(i,j)*F77p(i,j+1)) - ((F33p(i,j)*v1(i,j))/(F37p(i,j))) - (F34p(i,j)*v1(i,j+1))/(F37p(i,j)) - v2(i,j));
            r4(i-1,j-1)=-(+((F33p(i,j)*F74p(i,j)*vyt(i,j))/(F37p(i,j)*F77p(i,j))) + (F34p(i,j)*F74p(i,j+1)*vyt(i,j+1))/(F37p(i,j)*F77p(i,j+1)) + (F33p(i,j)*F75p(i,j)*vyt(i-1,j))/(F37p(i,j)*F77p(i,j)) + (F34p(i,j)*F75p(i,j+1)*vyt(i-1,j+1))/(F37p(i,j)*F77p(i,j+1)) - ((F33p(i,j)*v4(i,j))/(F37p(i,j))) - (F34p(i,j)*v4(i,j+1))/(F37p(i,j)) - v5(i,j));
        end
    end
    for j = 2 : M
        d2(:,j) = d2(:,j) - (l2(:,j)./d2(:,j-1)).*u2(:,j-1);
        r2(:,j) = r2(:,j) - (l2(:,j)./d2(:,j-1)).*r2(:,j-1);
        d4(:,j) = d4(:,j) - (l4(:,j)./d4(:,j-1)).*u4(:,j-1);
        r4(:,j) = r4(:,j) - (l4(:,j)./d4(:,j-1)).*r4(:,j-1);
    end
    X2(:,M) = r2(:,M)./d2(:,M);
    X4(:,M) = r4(:,M)./d4(:,M);
    for j = M - 1 : -1 : 1
        X2(:,j) = (r2(:,j) - u2(:,j).*X2(:,j+1))./d2(:,j);
        X4(:,j) = (r4(:,j) - u4(:,j).*X4(:,j+1))./d4(:,j);
    end
    uxt(2:n+1,2:m)=X2;
    vxt(2:n+1,2:m)=X4;    
end
% iter1
for i=2:n+1
    for j=2:m+1
        f1(i,j)=  -((+ rho*(((((uyt(i,j) - uyt(i-1,j))/(2*a))+((vxt(i,j) - vxt(i,j-1))/(2*a))))/(2*tao))))+( + ...   %D term
            (rho*(((uytold(i,j)-uytold(i-1,j))/(2*a))*((uyt(i,j)-uyt(i-1,j))/(2*a)))) + ...               %a^2 term
            (rho*(((vyt(i,j)-vyt(i-1,j))/(2*a))*((uxtold(i,j)-uxtold(i,j-1))/(2*b)))) + (rho*(((vytold(i,j)-vytold(i-1,j))/(2*a))*((uxt(i,j)-uxt(i,j-1))/(2*b)))) +...
            (rho*(((vxtold(i,j)-vxtold(i,j-1))/(2*b))*((vxt(i,j)-vxt(i,j-1))/(2*b)))));                   %b^2 term
    end
end
%RIGHT HAND SIDE OF POISSON EQUATION
W1=1;
W2=1;
PV1=zeros(1,n*(m+1));
PV2=zeros(1,n*(m+1));
for j=1:n+1
    for i=1:m+1
        if(i>1)
            if(j==1)
                PV1(W1) = - (F15(i,j+1)*f1(i,j+1))/(F11(i,j+1) + 2*F13(i,j+1)) + v7(i,j);
                W1=W1+1;
            elseif(j==m+1 && i==n+1)
                PV1(W1) = 0;%- F15(i,j)*f1(i,j) - v7(i,j);
                W1=W1+1;
            elseif(j==m+1 && i<n+1)
                PV1(W1) = - (F15(i,j)*f1(i,j))/(F11(i,j) + 2*F13(i,j)) + v7(i,j);
                W1=W1+1;
            else
                PV1(W1) = - (F15(i,j)*f1(i,j) + F16(i,j)*f1(i,j+1))/F17(i,j) + v7(i,j);
                W1=W1+1;
            end
        end
        if(j>1)
            if(i==1)
                PV2(W2) = - (F25(i+1,j)*f1(i+1,j))/(F21(i+1,j) + 2*F23(i+1,j)) + v8(i,j);
                W2=W2+1;
            elseif(i==n+1)
                PV2(W2) = - (F25(i,j)*f1(i,j))/(F21(i,j) + 2*F23(i,j)) + v8(i,j);
                W2=W2+1;
            else
                PV2(W2) = - (F25(i,j)*f1(i,j) + F26(i,j)*f1(i+1,j))/F27(i,j) + v8(i,j);
                W2=W2+1;
            end
        end
    end
end
%Complete RHS
PV = [PV1 PV2];
%MATRIX-VECTOR MULTIPLICATION
P=mtimes(P_inv,PV');
%Converting vector into variable matricies
X1=1;
X2=(n*(m+1))+1;
for j=1:n+1
    for i=1:m+1
        if(i>1)
            if(j==1)
                Pxt(i,j) = P(X1);
                X1=X1+1;
            elseif(j==m+1)
                Pxt(i,j) = P(X1);
                X1=X1+1;
            else
                Pxt(i,j) = P(X1);
                X1=X1+1;
            end
        end
        if(j>1)
            if(i==1)
                Pyt(i,j) = P(X2);
                X2=X2+1;
            elseif(i==n+1)
                Pyt(i,j) = P(X2);
                X2=X2+1;
            else
                Pyt(i,j) = P(X2);
                X2=X2+1;
            end
        end
    end
end
%Explicit calculation of U and V velocity
for j=2:m+1
    for i=2:n+1
        uxy(i,j)=(F71p(i,j)*uxt(i,j) + F72p(i,j)*uxt(i,j-1) + F74p(i,j)*uyt(i,j) + F75p(i,j)*uyt(i-1,j))/F77p(i,j)  - v1(i,j);
    end
end
for i=2:n+1
    for j=2:m+1
        vxy(i,j)=(F71p(i,j)*vxt(i,j) + F72p(i,j)*vxt(i,j-1) + F74p(i,j)*vyt(i,j) + F75p(i,j)*vyt(i-1,j))/F77p(i,j)  - v4(i,j);
    end
end
%toc
end

Dear @Mike Croucher, all the variables are defined as the random matrices and vectors. Now you can run the code.

Mike Croucher el 30 de Nov. de 2022

Thanks. So for N,M=50, the code runs in 0.01 seconds on my machine.

Increasing to N,M=100, the code runs in 0.22 seconds

Trying N,M=200, I run out of memory on my 32Gb laptop.

What values of N and M are you interested in and how fast do you need the code to be?

Nadeem Ahmed el 30 de Nov. de 2022

I will use this function for N,M>80 and I need to call this function more than thousands times therefore it should be of negligible time. Any suggestions are welcome.

Iniciar sesión para comentar.

Iniciar sesión para responder a esta pregunta.

Follow Question

Answer 1

Matt J el 30 de Nov. de 2022

Editada: Matt J el 30 de Nov. de 2022

Abrir en MATLAB Online

1 voto

Unfortunately, this is a situation where the for loop is the fastest option. This is because vectorized solution does much more memory allocation than it should. I have raised this issue with MathWorks staff, but am not sure what is being done on it.

function test
n=1000;
C1=zeros(n,n);
C2=zeros(n,n);
A=rand(n,n);
B=rand(n,n);
timeit(@()method1)
timeit(@()method2)

ans =

0.0161

ans =

0.0210

function method1
 for i=2:n-1
    for j=2:n-1
        C1(i,j) = (A(i,j)*B(i,j-1) + A(i-1,j)*B(i+1,j-1))/(A(i,j+1)*B(i+1,j));
    end
 end
end
function method2
    C2(2:n-1,2:n-1)=(A(2:n-1,2:n-1).*B(2:n-1,1:n-2) + A(1:n-2,2:n-1).*B(3:n,1:n-2))./(A(2:n-1,3:n).*B(3:n,2:n-1));
end
end

9 comentarios
Mostrar 7 comentarios más antiguos Ocultar 7 comentarios más antiguos

Bruno Luong el 30 de Nov. de 2022

Editada: Bruno Luong el 30 de Nov. de 2022

I don't think the problem is allocating memory, but actually indexing with truncation index, which requires elements in memory to be rearranged.

I'm not surprised that to make a vectorize code as fast as the for-loop requires a big development of the internal engine (for instant using meta data that describe subarray of an array without copying the data).

Indexing is always the bottleneck of MATLAB.

Matt J el 30 de Nov. de 2022

Abrir en MATLAB Online

I don't think the problem is allocating memory, but actually indexing with truncation index

Not sure what a "truncation index" refers to here. In any case, the subsref operations are definitely to blame, since when we revise the test with the indexing done offline, the vectorized version is much more competitive with the loops:

function test
n=1000;
C1=zeros(n,n);
C2=zeros(n,n);
A=rand(n,n);
B=rand(n,n);
[Q1,Q2,Q3,Q4,Q5,Q6]=...
deal( A(2:n-1,2:n-1) , B(2:n-1,1:n-2), A(1:n-2,2:n-1),...
      B(3:n,1:n-2), A(2:n-1,3:n), B(3:n,2:n-1) );
timeit(@()method1)
timeit(@()method2)

ans =

0.0149

ans =

0.0051

    function method1
    
     for i=2:n-1
        for j=2:n-1
            C1(i,j) = (A(i,j)*B(i,j-1) + A(i-1,j)*B(i+1,j-1))/(A(i,j+1)*B(i+1,j));
        end
     end
    
    end
    
    function method2
        C2(2:n-1,2:n-1)=(Q1.*Q2 + Q3.*Q4)./(Q5.*Q6);
    end
end

Iniciar sesión para comentar.

Answer 2

Mike Croucher el 30 de Nov. de 2022

Abrir en MATLAB Online

4 votos

Switch the order of the loops around. It will be faster because you'll be operating on the matrix column-wise

test
loops
ans = 0.0884
loops 2
ans = 0.0203
vector
ans = 0.0928
function test
n=2000;
C1=zeros(n,n);
C2=zeros(n,n);
A=rand(n,n);
B=rand(n,n);
disp('loops')
timeit(@()loops)
disp('loops 2')
timeit(@()loops2)
disp('vector')
timeit(@()vector)
function loops
 for i=2:n-1
    for j=2:n-1
        C1(i,j) = (A(i,j)*B(i,j-1) + A(i-1,j)*B(i+1,j-1))/(A(i,j+1)*B(i+1,j));
    end
 end
end
function loops2
    for j=2:n-1
    for i=2:n-1
        C1(i,j) = (A(i,j)*B(i,j-1) + A(i-1,j)*B(i+1,j-1))/(A(i,j+1)*B(i+1,j));
    end
 end
end
function vector
    C2(2:n-1,2:n-1)=(A(2:n-1,2:n-1).*B(2:n-1,1:n-2) + A(1:n-2,2:n-1).*B(3:n,1:n-2))./(A(2:n-1,3:n).*B(3:n,2:n-1));
end
end

4 comentarios
Mostrar 2 comentarios más antiguos Ocultar 2 comentarios más antiguos

Nadeem Ahmed el 30 de Nov. de 2022

Yes, you are right, beacsue I changed all my for loop but still I didn't get any improvement.

Dyuman Joshi el 30 de Nov. de 2022

This is neat, @Mike Croucher! Learned something new today :D

Iniciar sesión para comentar.

Efficient way of Vectorization

8 comentarios
Mostrar 6 comentarios más antiguos Ocultar 6 comentarios más antiguos

Respuesta aceptada

9 comentarios
Mostrar 7 comentarios más antiguos Ocultar 7 comentarios más antiguos

Más respuestas (1)

4 comentarios
Mostrar 2 comentarios más antiguos Ocultar 2 comentarios más antiguos

Categorías

Productos

Versión

Etiquetas

Community Treasure Hunt

Efficient way of Vectorization

8 comentarios Mostrar 6 comentarios más antiguos Ocultar 6 comentarios más antiguos

Respuesta aceptada

9 comentarios Mostrar 7 comentarios más antiguos Ocultar 7 comentarios más antiguos

Más respuestas (1)

4 comentarios Mostrar 2 comentarios más antiguos Ocultar 2 comentarios más antiguos

Categorías

Productos

Versión

Etiquetas

Ver también

Community Treasure Hunt

8 comentarios
Mostrar 6 comentarios más antiguos Ocultar 6 comentarios más antiguos

9 comentarios
Mostrar 7 comentarios más antiguos Ocultar 7 comentarios más antiguos

4 comentarios
Mostrar 2 comentarios más antiguos Ocultar 2 comentarios más antiguos