#undef DOLD

#ifdef DOLD

void acde_2d_b(float **uc, float **ucb, float **up, float **upb,
	       float **csq, float ***csqb,
	       int *s, int *e, int *z,
	       float **c, int k, 
	       int *lbc, int *rbc,
	       float *lap) {
  
  int i0, i1, i2, ioff;
  int s0 = s[0];
  int e0 = e[0];
  float c0 = c[0][0] + c[1][0];
  
  /* boundary conditions - note that uc[-1][i]=0 etc. */
  if (rbc[0])
    for (i1 = e[1]; i1 > s[1]-1; --i1)
      for (ioff = k-2; ioff > -1; --ioff) {
	upb[i1][e[0] - ioff] = upb[i1][e[0] - ioff] - upb[i1][e[0] + ioff + 2];
	upb[i1][e[0] + ioff + 2] = 0.0;
      }

  if (lbc[0])
    for (i1 = e[1]; i1 > s[1]-1; --i1)
      for (ioff = k-2; ioff > -1; --ioff) {
	upb[i1][s[0] + ioff] = upb[i1][s[0] + ioff] - upb[i1][s[0] - ioff - 2];
	upb[i1][s[0] - ioff - 2] = 0.0;
      }


  if (rbc[1])
    for (ioff = k-2; ioff > -1; --ioff)
#pragma ivdep
      for (i0 = e0; i0 > s0-1; --i0) {
	upb[e[1] - ioff][i0] = upb[e[1] - ioff][i0] - upb[e[1] + 2 + ioff][i0];
	upb[e[1] + 2 + ioff][i0] = 0.0;
      }

  if (lbc[1])
    for (ioff = k-2; ioff > -1; --ioff)
#pragma ivdep
      for (i0 = e0; i0 > s0-1; --i0) {
	upb[s[1] + ioff][i0] = upb[s[1] + ioff][i0] - upb[s[1] - ioff - 2][i0];
	upb[s[1] - ioff - 2][i0] = 0.0;
      }

  /* calculate the Laplacian operator as a function of x */
  for (i1 = e[1]; i1 >= s[1]; i1--) {
#pragma ivdep
    for (i0 = s0; i0 <= e0; i0++) {
      lap[i0] = c0*uc[i1][i0];
    }

    for (ioff=1; ioff < k+1; ++ioff) {
#pragma ivdep
      for (i0 = s0; i0 <= e0; i0++) {
	lap[i0] += c[0][ioff]*(uc[i1][i0+ioff]+uc[i1][i0-ioff])
	  + c[1][ioff]*(uc[i1+ioff][i0]+uc[i1-ioff][i0]);
      } 
      /* end loop over x */
    }

    /* calculate the extended csqb operator */
    for (i2 = s[2]-z[2]; i2 <= e[2]-z[2]; i2++) {
      int locs0 = (s[0] > s[0] + 2*i2 ? s[0] : s[0] + 2*i2);
      int loce0 = (e[0] < e[0] + 2*i2 ? e[0] : e[0] + 2*i2);
      int loci2 = i2 + z[2];
#pragma ivdep          
      for (i0 = loce0; i0 >= locs0; i0--) {
	csqb[loci2][i1][i0-i2] += lap[i0-2*i2] * upb[i1][i0];
      }  /* end loop over x */
    } /* end loop over h */
    
    for (ioff=1; ioff < k+1; ++ioff)
#pragma ivdep
      for (i0 = s0; i0 <= e0; i0++) {
	float csqupb = csq[i1][i0] * upb[i1][i0];
        ucb[i1][i0 + ioff] +=  c[0][ioff]*csqupb;
	ucb[i1][i0 - ioff] +=  c[0][ioff]*csqupb;
	ucb[i1 + ioff][i0] +=  c[1][ioff]*csqupb;
	ucb[i1 - ioff][i0] +=  c[1][ioff]*csqupb;
      }

    /* extrapolate the previous time step */
#pragma ivdep
    for (i0 = e0; i0 >= s0; i0--) {            
      float csqupb = csq[i1][i0] * upb[i1][i0];
      ucb[i1][i0] += c0 * csqupb + 2.0*upb[i1][i0];
      upb[i1][i0] = -upb[i1][i0];
    } /* end loop over x */

  } /* end loop over z */

}

#else

/*        Generated by TAPENADE     (INRIA, Ecuador team)
    Tapenade 3.11 (r5902M) - 15 Dec 2015 09:00


  Differentiation of acde_2d in reverse (adjoint) mode:
   gradient     of useful results: **uc **up
   with respect to varying inputs: ***csq **uc **up
   RW status of diff variables: ***csq:out **uc:incr **up:in-out
   Plus diff mem management of: lap:in csq:in *csq:in **csq:in
                uc:in *uc:in up:in *up:in csqlap:in
*/
void acde_2d_b(float **uc, float **ucb, float **up, float **upb, float ***csq,
        float ***csqb, int *s, int *e, int *z, float **c, int k, int pbg, int 
        *lbc, int *rbc, float *lap, float *csqupb) {
  int i0, i1, i2;
  int ioff;
  int s0 = s[0];
  int e0 = e[0];
  int z2 = z[2];
  // compute central fd coeff
  float c0 = c[0][0] + c[1][0];
  float tempb;
  float tempb0;
  float tempb1;

  /* boundary conditions - note that uc[-1][i]=0 etc. */
  if (rbc[0])
    for (i1 = e[1]; i1 > s[1]-1; --i1)
      for (ioff = k-2; ioff > -1; --ioff) {
	upb[i1][e[0] - ioff] = upb[i1][e[0] - ioff] - upb[i1][e[0] + ioff + 2];
	upb[i1][e[0] + ioff + 2] = 0.0;
      }

  if (lbc[0])
    for (i1 = e[1]; i1 > s[1]-1; --i1)
      for (ioff = k-2; ioff > -1; --ioff) {
	upb[i1][s[0] + ioff] = upb[i1][s[0] + ioff] - upb[i1][s[0] - ioff - 2];
	upb[i1][s[0] - ioff - 2] = 0.0;
      }


  if (rbc[1])
    for (ioff = k-2; ioff > -1; --ioff)
#pragma ivdep
      for (i0 = e0; i0 > s0-1; --i0) {
	upb[e[1] - ioff][i0] = upb[e[1] - ioff][i0] - upb[e[1] + 2 + ioff][i0];
	upb[e[1] + 2 + ioff][i0] = 0.0;
      }

  if (lbc[1])
    for (ioff = k-2; ioff > -1; --ioff)
#pragma ivdep
      for (i0 = e0; i0 > s0-1; --i0) {
	upb[s[1] + ioff][i0] = upb[s[1] + ioff][i0] - upb[s[1] - ioff - 2][i0];
	upb[s[1] - ioff - 2][i0] = 0.0;
      }

  /* calculate the Laplacian operator as a function of x */

  for (i1 = s[1]; i1 < e[1]+1; ++i1) {
#pragma ivdep
    for (i0 = s0; i0 < e0+1; ++i0) {
      lap[i0] = c0*uc[i1][i0];
      csqupb[i0]=0.0;
    }
    for (ioff = 1; ioff < k+1; ++ioff) {
#pragma ivdep
      for (i0 = s0; i0 < e0+1; ++i0) {
	lap[i0] += c[0][ioff]*(uc[i1][i0+ioff]+uc[i1][i0-ioff]) 
	  + c[1][ioff]*(uc[i1+ioff][i0]+uc[i1-ioff][i0]);
      }
    /* end loop over x */
    }

    /* calculate the extended csqb operator */
    for (i2 = s[2]-z[2]; i2 <= e[2]-z[2]; i2++) {
      int locs0 = (s[0] > s[0] + 2*i2 ? s[0] : s[0] + 2*i2);
      int loce0 = (e[0] < e[0] + 2*i2 ? e[0] : e[0] + 2*i2);
      int loci2 = i2 + z[2];
#pragma ivdep          
      for (i0 = loce0; i0 >= locs0; i0--) {
	csqb[loci2][i1][i0-i2] += lap[i0-2*i2] * upb[i1][i0];
      }  /* end loop over x */
    } /* end loop over h */

    if (!pbg) {
      for (i2 = e[2]-z[2]; i2 > s[2]-z[2]-1; --i2) {
	int locs0 = (s[0] > s[0] + 2*i2 ? s[0] : s[0] + 2*i2);
	int loce0 = (e[0] < e[0] + 2*i2 ? e[0] : e[0] + 2*i2);
	int loci2 = i2 + z[2];
#pragma ivdep
	for (i0 = loce0; i0 > locs0-1; --i0) {
	  csqupb[i0 - 2*i2] += csq[loci2][i1][i0-i2]*upb[i1][i0];
	}
      }
      for (ioff = k; ioff > 0; --ioff) {
#pragma ivdep
	for (i0 = e0; i0 > s0-1; --i0) {
	  tempb0 = c[0][ioff]*csqupb[i0];
	  tempb1 = c[1][ioff]*csqupb[i0];
	  ucb[i1][i0 + ioff] += tempb0;
	  ucb[i1][i0 - ioff] += tempb0;
	  ucb[i1 + ioff][i0] += tempb1;
	  ucb[i1 - ioff][i0] += tempb1;
	}
      }      
#pragma ivdep
      for (i0 = e0; i0 > s0-1; --i0) {
	ucb[i1][i0] += c0*csqupb[i0]+ 2.0*upb[i1][i0];
	upb[i1][i0] = -upb[i1][i0];
      }
    } /* end non-physical bg branch */
    else {
      for (ioff = k; ioff > 0; --ioff) {
#pragma ivdep
	for (i0 = e0; i0 > s0-1; --i0) {
	  tempb = csq[z2][i1][i0]*upb[i1][i0];
	  tempb0 = c[0][ioff]*tempb;
	  tempb1 = c[1][ioff]*tempb;
	  ucb[i1][i0 + ioff] += tempb0;
	  ucb[i1][i0 - ioff] += tempb0;
	  ucb[i1 + ioff][i0] += tempb1;
	  ucb[i1 - ioff][i0] += tempb1;
	}
      }      
#pragma ivdep
      for (i0 = e0; i0 > s0-1; --i0) {
	ucb[i1][i0] += (c0*csq[z2][i1][i0] + 2.0)*upb[i1][i0];
	upb[i1][i0] = -upb[i1][i0];
      }
    } /* end physical bg branch */
  } /* end loop over z */
}

#endif