function [DeformedSource,dX, dY, optimal_ncc] = nr_ncc3(source, target, opt)
% NR_NCC3  Non-rigid registration of SOURCE to TARGET by maximizing the NCC.
%
% The displacement field is modeled as U = imfilter(Cu,phi), V = imfilter(Cv,phi),
% where phi is the basis function returned by gaussian_bf and Cu, Cv are the
% optimization variables. The coefficients are updated by gradient ascent with
% a rough line search on the (squared) normalized cross correlation, averaged
% over the image channels.
%
% NOTE: earlier documentation mentioned a mask W restricting the region of
% integration; this version sums over the whole image and uses no mask.
%
% Inputs:
%   source, target - images of identical size, M-by-N or M-by-N-by-K
%                    (e.g. K = 3 for RGB). Presumably floating point, since
%                    they are passed directly to interp2 - TODO confirm.
%   opt            - struct with fields:
%       sigma                - std of basis function for modeling deformations
%       sigma_i              - std of basis function for image derivatives
%       max_pix_displacement - maximum pixel displacement in one iteration
%       n_stepsizes          - number of step sizes tried per iteration; the
%                              first trial step is zero, so use >= 2
%       max_iterations       - maximum number of gradient ascent iterations
%       tolerance            - iterate while the NCC improvement of an
%                              iteration is at least this value
%
% Outputs:
%   DeformedSource - source resampled at (dX,dY), bilinear, zero outside
%   dX, dY         - sampling coordinates X_0 - U and Y_0 - V (not the
%                    displacements themselves)
%   optimal_ncc    - NCC value reached for the returned deformation
%
% Author: G.K. Rohde

DISPLAY = 0; % set > 0 to show figures and print progress
mag = 2;     % figure magnification factor (only used when DISPLAY > 0)

%% get initial parameters

sigma = opt.sigma;                               %std of basis function for modeling deformations
sigma_i = opt.sigma_i;                           %std of basis functions for computing image derivatives
max_pix_displacement = opt.max_pix_displacement; %maximum pixel displacement in one gradient ascent iteration
n_stepsizes = opt.n_stepsizes;                   %number of step sizes to try within one iteration
max_iterations = opt.max_iterations;             %maximum number of iterations to use
tolerance = opt.tolerance;                       %minimum NCC improvement required to keep iterating

src = source;
trg = target;

if (DISPLAY>0)
   figure(1); set(gcf,'Color',[1 1 1])
   imshow(src,[],'Border','tight','InitialMagnification',mag*100);title('Source')
   figure(2); set(gcf,'Color',[1 1 1])
   imshow(trg,[],'Border','tight','InitialMagnification',mag*100);title('Target')
end

%% initialize filters for computing deformations, and derivatives

[Xt,Yt] = meshgrid(-2*sigma:2*sigma,-2*sigma:2*sigma);
[Xti,Yti] = meshgrid(-2*sigma_i:2*sigma_i,-2*sigma_i:2*sigma_i);
[phi,phi_x,phi_y] = gaussian_bf(Xt,Yt,sigma);        %#ok<ASGLU> only phi is used
[iphi,iphi_x,iphi_y] = gaussian_bf(Xti,Yti,sigma_i); %#ok<ASGLU> only the derivatives are used
% imfilter performs correlation by default; flipping the derivative kernels
% turns the operation into a convolution with them.
iphi_x = fliplr(iphi_x);
iphi_y = flipud(iphi_y);

%% set up initial grid

[M,N,K] = size(src); % K = 3 for RGB
[X_0,Y_0] = meshgrid(1:N,1:M);
Cu = zeros(M,N); %x-coefficients of displacement field
Cv = zeros(M,N); %y-coefficients of displacement field

%% pre compute norm of each channel in target image

n_wtrg = zeros(1,K);
for i=1:K
    n_wtrg(i) = sqrt(sum(sum((trg(:,:,i)).^2)));
end

%% compute initial NCC (zero displacement)

c_src = nr_ncc3_warp(src,X_0,Y_0);
c_ncc = nr_ncc3_score(c_src,trg,n_wtrg);
if (DISPLAY>0)
    fprintf('initial ncc = %g\n',c_ncc);
end

%% optimize using gradient ascent
STOP = 0;
n_iter = 0;
while (STOP == 0)

    % current displacement field and deformed source
    U = imfilter(Cu,phi); %x displacement field
    V = imfilter(Cv,phi); %y displacement field
    c_src = nr_ncc3_warp(src,X_0 - U,Y_0 - V);

    if (DISPLAY>0)
        figure(3); set(gcf,'Color',[1 1 1])
        imshow(c_src,[],'Border','tight','InitialMagnification',mag*100);title('Deformed source')
    end

    % gradients of the deformed source, per channel
    dSdx = zeros(M,N,K,class(c_src));
    dSdy = zeros(M,N,K,class(c_src));
    for i = 1:K
        dSdx(:,:,i) = imfilter(c_src(:,:,i),iphi_x);
        dSdy(:,:,i) = imfilter(c_src(:,:,i),iphi_y);
    end

    % gradient of the cost with respect to the coefficients
    dPsi_dCu = zeros(M,N);
    dPsi_dCv = zeros(M,N);

    for i=1:K %average gradients for each image channel
        S = c_src(:,:,i);
        T = trg(:,:,i);

        inner = sum(sum(S.*T)); % signed inner product <S,T>
        One = inner^2;
        Two = n_wtrg(i)^2;
        Three = sum(sum(S.^2));

        % d(inner^2) = 2*inner*d(inner): the signed value must be used here,
        % sqrt(One) = |inner| gives the wrong sign for negative correlation.
        One_u = -2*inner*imfilter(T.*dSdx(:,:,i),phi);
        One_v = -2*inner*imfilter(T.*dSdy(:,:,i),phi);
        Three_u = -2*imfilter(S.*dSdx(:,:,i),phi);
        Three_v = -2*imfilter(S.*dSdy(:,:,i),phi);

        % quotient rule for One/(Two*Three)
        dPsi_dCu = dPsi_dCu+1/K*(One_u.*Two*Three - One*Two*Three_u)./( (Two*Three)^2 );
        dPsi_dCv = dPsi_dCv+1/K*(One_v.*Two*Three - One*Two*Three_v)./( (Two*Three)^2 );
    end

    % do a rough line search
    conv_dPsi_dCu = imfilter(dPsi_dCu,phi); % displacement change per unit step
    conv_dPsi_dCv = imfilter(dPsi_dCv,phi);

    pix_displacement = sqrt( conv_dPsi_dCu.^2 + conv_dPsi_dCv.^2 );
    c_pd = max(pix_displacement(:));

    % A zero (or non-finite) gradient would make max_lambda Inf/NaN and fill
    % the line search with NaNs; nothing can be improved, so stop here.
    if ~(c_pd > 0) || ~isfinite(c_pd)
        break;
    end

    % largest step moves the most displaced pixel by max_pix_displacement
    max_lambda = max_pix_displacement/c_pd;
    step_size_array = linspace(0,max_lambda,n_stepsizes);

    s_array = zeros(size(step_size_array));
    for s = 1:length(step_size_array)
        ss = step_size_array(s);
        Xct = X_0 - (U+ss*conv_dPsi_dCu);
        Yct = Y_0 - (V+ss*conv_dPsi_dCv);
        s_array(s) = nr_ncc3_score(nr_ncc3_warp(src,Xct,Yct),trg,n_wtrg);
    end

    % first index of the largest cost function value (max ignores NaNs)
    [t_c_ncc,best] = max(s_array);

    % update coefficients (optimization variables); index 1 is the zero step
    if (best ~= 1)
        ss = step_size_array(best);
        Cu = Cu + ss*dPsi_dCu;
        Cv = Cv + ss*dPsi_dCv;
    end

    %check for improvement (should be non-negative)
    improvement = t_c_ncc - c_ncc;
    c_ncc = t_c_ncc;
    if (DISPLAY>0)
        fprintf('iteration %d: ncc = %g, improvement = %g (tolerance %g)\n', ...
            n_iter+1,c_ncc,improvement,tolerance);
    end
    if ~(improvement >= tolerance) % also stops when improvement is NaN
        STOP = 1; %quit iterating
    end

    %stop if maximum number of iterations reached
    n_iter = n_iter+1;
    if (n_iter >= max_iterations)
        STOP = 1;
    end

end


%% assign outputs
% Recompute the deformation from the final coefficients so that
% DeformedSource, dX, dY and optimal_ncc all describe the same state
% (c_src from the loop predates the last coefficient update).
U = imfilter(Cu,phi); %x displacement field
V = imfilter(Cv,phi); %y displacement field
dX = X_0 - U;
dY = Y_0 - V;
DeformedSource = nr_ncc3_warp(src,dX,dY);
optimal_ncc = c_ncc;


function warped = nr_ncc3_warp(img, Xq, Yq)
% Resample every channel of img at (Xq,Yq) by bilinear interpolation;
% samples falling outside the image are set to 0.
warped = img;
for k = 1:size(img,3)
    warped(:,:,k) = interp2(img(:,:,k),Xq,Yq,'linear',0);
end


function val = nr_ncc3_score(img, trg, n_wtrg)
% Channel-averaged squared NCC between img and trg:
% mean over k of <img_k,trg_k>^2 / (|trg_k|^2 * |img_k|^2),
% where n_wtrg(k) is the precomputed norm of target channel k.
K = size(img,3);
val = 0;
for k = 1:K
    num = (sum(sum( img(:,:,k).*trg(:,:,k) )))^2;
    den = n_wtrg(k)^2*(sum(sum( img(:,:,k).^2 )));
    val = val + 1/K*num/den;
end
