function [res, dX, dY] = register_ncc2(src, trg, opt)
%REGISTER_NCC2 Non-rigid registration of SRC onto TRG by maximising squared NCC.
%
%   [res, dX, dY] = register_ncc2(src, trg)
%   [res, dX, dY] = register_ncc2(src, trg, opt)
%
%   A dense displacement field (Ux, Uy) is estimated coarse-to-fine by
%   gradient ascent on the squared normalised cross-correlation
%       NCC^2 = (sum(S.*T))^2 / (sum(S.^2) * sum(T.^2)),
%   where S is SRC warped by the current field and T is TRG. After every
%   step the field is smoothed with a Gaussian and pinned to zero on the
%   image border.
%
%   Inputs
%     src, trg : images of identical size.
%                NOTE(review): the code uses [M,N] = size(...), so 2-D
%                grayscale input is assumed - confirm against callers.
%     opt      : optional struct; any missing/empty field takes its default.
%         max_it   (1000) maximum gradient steps per resolution level
%         lambda   (1)    step size of the displacement update
%         sigma    (10)   std. dev. of the Gaussian applied to both images
%         sigma_p  (1.5)  std. dev. of the Gaussian smoothing the field
%         multiRes (2)    number of extra half-resolution pyramid levels
%         verbose  (true) show the warped image in figure 11 and print the
%                         NCC values each iteration (the historical behaviour)
%
%   Outputs
%     res    : the original (unblurred) src sampled at (dX, dY) with interp2,
%              then passed through NaNFix.
%     dX, dY : sampling coordinates X - Ux and Y - Uy at the final level
%              (absolute coordinates, not displacements).
%
%   Requires fspecial / imfilter / imresize and the project helper NaNFix
%   (defined outside this file; presumably replaces NaNs - confirm).

% ---- options: start from defaults, overlay whatever the caller supplied ----
defaults = struct('max_it',1000, 'lambda',1, 'sigma',10, ...
                  'sigma_p',1.5, 'multiRes',2, 'verbose',true);
if nargin < 3 || isempty(opt)
    opt = defaults;
else
    names = fieldnames(defaults);
    for k = 1:numel(names)
        if ~isfield(opt,names{k}) || isempty(opt.(names{k}))
            opt.(names{k}) = defaults.(names{k});
        end
    end
end
max_it   = opt.max_it;
lambda   = opt.lambda;    % Step size of the velocity update
sigma    = opt.sigma;     % Size of Gaussian smoothing for images
sigma_p  = opt.sigma_p;   % Size of Gaussian for smoothing deformation field
multiRes = opt.multiRes;
verbose  = opt.verbose;

% interp2 expects floating-point sample values; leave single/double untouched
if ~isfloat(src), src = double(src); end
if ~isfloat(trg), trg = double(trg); end

% ---- initialise: zero displacement on the full-resolution grid ----
src_original = src;
[M,N] = size(src);
[X_0,Y_0] = meshgrid(1:N,1:M);
Ux = zeros(M,N);
Uy = zeros(M,N);
% Identity sampling grid; only survives if the resolution loop never runs
X_c = X_0;
Y_c = Y_0;

% ---- blur the images a little ----
% NOTE(review): a 9x9 support with sigma = 10 is almost a box filter;
% left as-is so results stay comparable with earlier runs.
hG  = fspecial('gaussian',[9,9],sigma);   % filter for blurring the images
hGp = fspecial('gaussian',[9,9],sigma_p); % filter for blurring the field
src = imfilter(src,hG);
trg = imfilter(trg,hG);

for r = 0:1:multiRes

    % pyramid level: rt = multiRes is the coarsest, rt = 0 is full resolution
    rt = multiRes-r;
    src_t = imresize(src,(1/2)^(rt),'bilinear');
    trg_t = imresize(trg,(1/2)^(rt),'bilinear');

    % linear grid for this level
    [Mp,Np] = size(Ux);          % size of the level the field comes from
    [M,N] = size(src_t);
    [X_0,Y_0] = meshgrid(1:N,1:M);

    % Resample the field to this level. Displacements are in pixels, so they
    % must be rescaled by the actual size ratio (2 between consecutive
    % levels). The previous factor (r+1) over-scaled the field from the
    % third level onwards.
    Ux = imresize(Ux,[M,N],'bilinear')*(N/Np);
    Uy = imresize(Uy,[M,N],'bilinear')*(M/Mp);

    t_norm_sqrd = sum(sum(trg_t.^2));

    % squared NCC of the starting point at this level
    X_c = X_0 - Ux;
    Y_c = Y_0 - Uy;
    src_temp = interp2(src_t,X_c,Y_c,'linear',0);
    [src_temp] = NaNFix(src_temp);
    SfT = sum(sum(src_temp.*trg_t));
    s_norm_sqrd = sum(sum(src_temp.^2));
    current_ncc = (SfT.^2/(s_norm_sqrd*t_norm_sqrd));
    if verbose
        current_ncc %#ok<NOPRT>
    end
    [dIx,dIy] = gradient(src_temp);

    iterations = 0;
    while iterations < max_it   % at most max_it steps per level

        iterations = iterations+1;
        if verbose
            figure(11);imagesc(src_temp);truesize;colormap gray;
        end

        % Derivative of NCC^2 with respect to the displacement. The warped
        % image samples src at X - U, hence the sign of the bracket.
        common = (2/(s_norm_sqrd^2*t_norm_sqrd))*SfT ...
                 *(SfT*src_temp - s_norm_sqrd*trg_t);
        dUx = common.*dIx;
        dUy = common.*dIy;

        % candidate field: ascent step (+, since this is a maximisation),
        % Gaussian regularisation, zero displacement on the border
        Uxt = imfilter(Ux + lambda*dUx,hGp,'same');
        Uyt = imfilter(Uy + lambda*dUy,hGp,'same');
        Uxt(:,1) = 0;Uxt(:,N) = 0;Uxt(1,:) = 0;Uxt(M,:) = 0;
        Uyt(:,1) = 0;Uyt(:,N) = 0;Uyt(1,:) = 0;Uyt(M,:) = 0;

        X_ct = X_0 - Uxt;
        Y_ct = Y_0 - Uyt;

        % Score the candidate. The cross term has to be recomputed from the
        % candidate image; reusing the previous SfT made the acceptance test
        % compare against a value that was not the candidate's NCC^2.
        src_tempt = interp2(src_t,X_ct,Y_ct,'linear',0);
        [src_tempt] = NaNFix(src_tempt);
        SfTt = sum(sum(src_tempt.*trg_t));
        s_norm_sqrdt = sum(sum(src_tempt.^2));
        current_ncct = SfTt.^2/(s_norm_sqrdt*t_norm_sqrd);
        if verbose
            current_ncct %#ok<NOPRT>
        end

        % stop at the first step that loses more than 0.001 of NCC^2
        % (a NaN score also fails this test and ends the level)
        if ~(current_ncct >= (current_ncc-0.001))
            break;
        end

        % accept the candidate
        X_c = X_ct;
        Y_c = Y_ct;
        Ux = Uxt;
        Uy = Uyt;
        src_temp = src_tempt;
        SfT = SfTt;
        s_norm_sqrd = s_norm_sqrdt;
        current_ncc = current_ncct;
        [dIx,dIy] = gradient(src_temp);

    end %while loop

end %resolution loop

% The last level is full resolution, so X_c/Y_c index src_original directly.
dX = X_c;
dY = Y_c;
res = interp2(src_original,X_c,Y_c);
[res] = NaNFix(res);

% figure; imshow(res,[]); hold on;
% plot(dX(:,1:10:size(res,2)),dY(:,1:10:size(res,2)),'g');
% plot(dX(1:10:size(res,1),:)',dY(1:10:size(res,1),:)','g');hold off