/*! \file
    \brief Jebelean algorithm
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#include "gcd_utils.h"
#include "qseq.h"
#include "regular.h"

#include "gcd_euclid.h"
#include "gcd_jebelean.h"

/* Compile-time trace level for this file: the code below guards its
   printf/assert diagnostics with ">= 1", ">= 2", ">= 3" and ">= 4" tests
   on this value. */
#define DEBUG_JEBELEAN 0 /* 3 generates u, v, equations */
/* When non-zero, jebelean_gcd counts the iterations of its main loop and
   prints per-iteration and final counters. */
#define COUNT_STEPS    0

/** Special variant of a division step where 2 >= an >= bn >= 1.
    Computes q and r such that a = q*b + r, leaves q in q[0..qn[ and r in r[],
    appends q to lq and returns qn.
    A cheap test on the leading limbs decides whether q = 1 is worth trying:
    in that case r <- a-b is formed with a two-limb subtraction (which reads
    a[1] and b[1] and writes r[1], so short operands must be zero padded by
    the caller) and a real division is only performed when a-b >= b.
    tp, tp_alloc and qgcd are not used by this routine.
    PRECONDITION: q != tp
    FIXME: does not seem to bring something really more efficient.
    Use backwards routine instead?
    FIXME: R is not updated, is that right?
    TODO: use div_rem should be a better idea?
          this is an euclidean step, I may add...
*/
static inline mp_size_t jebelean_step(qseq_t lq, mp_ptr q, mp_ptr r,
                                      mp_ptr a, mp_size_t an,
                                      mp_ptr b, mp_size_t bn,
                                      mp_ptr tp, mp_size_t tp_alloc,
                                      int qgcd)
{
    mp_size_t nq;
    int try_one;
    const int gap = (int)(an-bn);

#if 1
    if(gap != 1){
        /* gap == 0: the xor is smaller than b's top limb exactly when
           both top limbs have their leading bit at the same rank */
        const mp_limb_t top_b = b[an-1];

        try_one = (a[an-1] ^ top_b) < top_b;
#if COUNT_QEQ1 == 1
        nbdeltaeq0++;
#endif
    }
    else{
        /* an = 2, bn = 1 */
        const mp_limb_t msb = ((mp_limb_t)1) << (mp_bits_per_limb-1);

        try_one = (a[1] == (mp_limb_t)1) && (b[0] & msb);
#if COUNT_QEQ1 == 1
        nbdeltaeq1++;
        if(try_one) nbxor++;
#endif
    }
#else
    try_one = (gap == 0);
#endif

    if(try_one == 0){
        /* no shortcut: plain division of a by b */
        mpn_tdiv_qr(q, r, 0, a, an, b, bn);
        nq = an-bn+1;
        MPN_NORMALIZE(q, nq);
        assert(nq != 0);
#if DEBUG_JEBELEAN >= 3
        printf("q:="); MPN_PRINT(q, nq); printf(";\n");
#endif
        qseq_add_last_mpn(lq, q, nq);
        return nq;
    }

    /* the quotient is probably 1: r <- a-b and look at what is left */
    sub_ddmmss(r[1], r[0], a[1], a[0], b[1], b[0]);
    if(mpn_cmp(r, b, an) >= 0){
        /* a-b >= b: divide a-b by b and add the 1 already subtracted */
        nq = an-bn+1; /* <= 2 */
        mpn_tdiv_qr(q, r, 0, r, an, b, bn);
        if(nq == 2 && q[1] == 0)
            nq = 1;
        /* TODO: case where q[1] becomes 1 or q[1] > 0? */
        /* q[1] += mpn_add_1(q, q, 1, 1); */
        add_ssaaaa(q[1], q[0], q[1], q[0], 0, 1);
        if(q[1] != 0)
            nq = 2;
#if DEBUG_JEBELEAN >= 3
        printf("q:="); MPN_PRINT(q, nq); printf(";\n");
#endif
    }
    else{
        /* a-b < b, hence q = 1 and r is already the remainder */
#if COUNT_QEQ1 == 1
        nbqeq1++;
#endif
        q[0] = 1;
#if DEBUG_JEBELEAN >= 3
        printf("q:=1;\n");
#endif
        nq = 1;
    }
    qseq_add_last_mpn(lq, q, nq);
    return nq;
}

/* INPUT: a[i] > a[i+1], |u[i]|, |u[i+1]|, |v[i]|, |v[i+1]|
   Assuming a's to be 1 or 2-word, u's and v's 1-word.
   Returns 1 when both inequalities below hold and 0 otherwise:
     i even: a[i+1] >= |u[i+1]|  and  a[i]-a[i+1] >= |v[i+1]|+|v[i]|
     i odd : a[i+1] >= |v[i+1]|  and  a[i]-a[i+1] >= |u[i+1]|+|u[i]|
   The first inequality is only tested when a[i+1] is one word long.
   ai[1] and aip1[1] are always read: a one-word operand must carry a zero
   second limb (asserted).
*/
static inline int jebelean_condition(int i,
                                     mp_ptr ai, mp_size_t ain,
                                     mp_ptr aip1, mp_size_t aip1n,
                                     mp_limb_t aui, mp_limb_t auip1,
                                     mp_limb_t avi, mp_limb_t avip1)
{
    const int even = ((i & 1) == 0);
    /* lower bound required for a[i+1] */
    const mp_limb_t floor1 = (even ? auip1 : avip1);
    /* the two cofactors whose sum must not exceed a[i]-a[i+1] */
    const mp_limb_t term1 = (even ? avip1 : auip1);
    const mp_limb_t term0 = (even ? avi : aui);
    mp_limb_t gap_hi, gap_lo, sum_hi, sum_lo;

    assert(ain != 0);
    assert(aip1n != 0);
#if DEBUG_JEBELEAN >= 3
    printf("Thm1.1: a[%d+1][%lu]=", i, aip1n); MPN_PRINT(aip1, aip1n);
    printf(" >= %lu?\n", floor1);
#endif
    if(aip1n == 1 && aip1[0] < floor1){
#if DEBUG_JEBELEAN >= 3
        if(even)
            printf("a[%d+1]=%lu < -u[%d+1]=%lu\n", i, aip1[0], i, auip1);
        else
            printf("a[%d+1]=%lu < -v[%d+1]=%lu\n", i, aip1[0], i, avip1);
#endif
        return 0;
    }
    /* two-limb sum of the cofactors: it may exceed one word */
    add_ssaaaa(sum_hi, sum_lo, 0, term1, 0, term0);
#if DEBUG_JEBELEAN >= 3
    printf("[%lu] ai:=", ain); MPN_PRINT(ai, ain); printf(";\n");
#endif
    assert(ain == 2 || ai[1] == 0);
    assert(aip1n == 2 || aip1[1] == 0);
    /* two-limb difference a[i]-a[i+1] */
    sub_ddmmss(gap_hi, gap_lo, ai[1], ai[0], aip1[1], aip1[0]);
    /* lexicographic comparison (gap_hi, gap_lo) >= (sum_hi, sum_lo) */
    if(gap_hi != sum_hi)
        return (gap_hi > sum_hi);
    return (gap_lo >= sum_lo);
}

/* INPUT: 2 >= akp1n >= akp2n >= 1 since akp1 and akp2 are consecutive
                                   remainders
   SIDE-EFFECT: am1:=akp2+qkp1*akp1 (two limbs, *am1n set to 1 or 2);
                aum1:=aukp2-qkp1*aukp1; avm1:=avkp2-qkp1*avkp1;
                with am1 = a[k], aum1 = |u[k]|, avm1 = |v[k]|
   akp2[1] is read whatever akp2n is, so a one-word akp2 must be zero padded.
   We roll back in the Euclidean sequence.
*/
static inline void rollback_1(mp_limb_t *am1,  mp_size_t *am1n,
                              mp_limb_t *aum1, mp_limb_t *avm1,
                              mp_limb_t *akp1, mp_size_t akp1n,
                              mp_limb_t *akp2,
                              MAYBE_UNUSED mp_size_t akp2n,
                              mp_limb_t aukp1, mp_limb_t aukp2,
                              mp_limb_t avkp1, mp_limb_t avkp2,
                              mp_limb_t qkp1)
{
    mp_limb_t prod_hi, prod_lo;

    /* am1 > aa; |um1| < |u0|, |u1|; |vm1| < |v0|, |v1| */
#if DEBUG_JEBELEAN >= 1
    printf("[%lu] akp2:=", akp2n); MPN_PRINT(akp2, akp2n); printf(";\n");
    printf("[%lu] akp1:=", akp1n); MPN_PRINT(akp1, akp1n); printf(";\n");
    printf("qkp1:=%lu;\n", qkp1);
#endif
    /* (prod_hi, prod_lo) <- akp1*qkp1, expected to fit into two limbs */
    umul_ppmm(prod_hi, prod_lo, akp1[0], qkp1);
    if(akp1n == 2){
        mp_limb_t top_hi, top_lo;

        /* contribution of the high limb of akp1: only its low word counts */
        umul_ppmm(top_hi, top_lo, akp1[1], qkp1);
        assert(top_hi == 0);
        prod_hi += top_lo;
    }
    /* am1 <- akp1*qkp1 + akp2 */
    add_ssaaaa(am1[1], am1[0], prod_hi, prod_lo, akp2[1], akp2[0]);
    *am1n = (am1[1] != 0 ? 2 : 1);
    /* cofactors one step earlier in the sequence */
    *aum1 = aukp2 - qkp1 * aukp1;
    *avm1 = avkp2 - qkp1 * avkp1;
}

/* NOTE(review): CHANGE_COND is not referenced in the visible part of this
   file. */
#define CHANGE_COND 0

#if DEBUG_JEBELEAN >= 3
/* Signed cofactors kept next to the unsigned au.. and av.. values; they only
   exist at debug level 3 and are compared against the unsigned ones in
   assert() checks. */
long u0, u1, u2, v0, v1, v2;
#endif

/** Runs Euclidean steps on the two-word approximations ah, bh while the
    single-word cofactors stay trustworthy, storing every quotient in R->lq.
    INPUT:  ah[0..*p_ahn[, bh[0..*p_bhn[ the leading words of the operands;
            rh is scratch for the remainder; use47 selects the stopping rule
            (0: Collins-like test on each step; > 0: stop as soon as the
            remainder drops below one word).
    OUTPUT: *p_i, *p_k, *sgn, *p_q, the cofactors *p_au0..*p_av2 and the
            sizes *p_ahn, *p_bhn, *p_rhn. q, au2 and av2 are reported as 0 when
            the loop is left before they could be computed.
    POSTCONDITION: (i, k) = (1, -1) because non ordinary stuff happened;
                   (i, k) = (i, i-2) with a[i]=a[k+2] >= BASE > a[i+1]=a[k+3]
                   All q's are stored.
                   au0 = au[k+1], av0 = av[k+1],
                   au1 = au[k+2], av1 = av[k+2],
                   au2 = au[k+3], av2 = av[k+3]
    We also have: a[i-1]=a[i]*q[i]+a[i+1]
                  u[i+1]=u[i-1]-q[i]*u[i]
                  v[i+1]=v[i-1]-q[i]*v[i]
    Only R->lq is updated
 */
static inline
void inside_loop(int *p_i, int *p_k, int *sgn, mp_limb_t *p_q,
                 mp_limb_t *p_au0, mp_limb_t *p_av0,
                 mp_limb_t *p_au1, mp_limb_t *p_av1,
                 mp_limb_t *p_au2, mp_limb_t *p_av2,
                 regular_t R, mp_ptr ah, mp_size_t *p_ahn,
                 mp_ptr bh, mp_size_t *p_bhn,
                 mp_ptr rh, mp_size_t *p_rhn,
                 mp_ptr tp, mp_size_t tp_alloc, int qgcd,
                 int use47)
{
    mp_limb_t w[2], qh[2];
    /* q, au2 and av2 are copied out unconditionally at the end: give them a
       defined value in case the loop is left before they are computed */
    mp_limb_t q = 0, tmphi, tmplo, sumh;
    mp_limb_t au0, au1, au2 = 0, av0, av1, av2 = 0;
    mp_size_t rhn = *p_rhn, wn, qpn, ahn = *p_ahn, bhn = *p_bhn;
    int i, j, k = -1, stop = 0;

#if DEBUG_JEBELEAN >= 3
    u0 = 1; u1 = 0; v0 = 0; v1 = 1;
#endif
    au0 = 1; au1 = 0;
    av0 = 0; av1 = 1;
    /* a0 = A, a1 = B
       Q1 = a0 div a1
       (U2, V2)=(U0, V0)-Q1*(U1, V1)
       u[i]*A+v[i]*B = a[i]; a[i] decreasing of course
       u_even > 0, u_odd <= 0
       v_even <= 0, v_odd > 0
       sign(u[i]) = -sign(v[i])
       |u[i]| <= |u[i+1]| <= |v[i+1]|, |v[i]| <= |v[i+1||
       => rule (A1) says that only v[i+1] need be computed with 2 words:
       v[i+1] = v[i-1]-q[i]*v[i]
       or av[i+1] = av[i-1]+q[i]*av[i]
       or av2 = av0 + q * av1
    */
    i = 1;
    *sgn = 1;
    while(stop == 0){
#if DEBUG_JEBELEAN >= 1
        printf("ah:="); MPN_PRINT(ah, 2); printf(";\n");
        printf("bh:="); MPN_PRINT(bh, 2); printf(";\n");
#endif
        if(bhn == 1 && bh[0] == 0){ /* etiq2 */
            /* cannot happen when use47 > 0: If bh | ah,
               then a[next]=0 < BASE, hence will terminate the iteration
            */
            assert(use47 == 0);
#if DEBUG_JEBELEAN >= 0
            fprintf(stderr, "Jebelean: bh == 0\n");
#endif
            break;
        }
        /* compute .[i+2]
           ah = a[i-1], bh = a[i] => q = q[i]
           u0 = u[i-1], u1 = u[i], v0 = v[i-1], v1 = v[i]
           a2 = a[i+1], u2 = u[i+1], v2 = v[i+1]
        */
        /* q = ah/bh; a2 = ah-q*bh = ah mod bh; always >= 0
           q has ahn-bhn+1 digits, so 1 or 2
        */
        /* (qp, rh) <- (ah div bh, ah mod bh) */
#if DEBUG_JEBELEAN >= 1
        printf("a[%d]:=", i-1); MPN_PRINT(ah, ahn); printf(";\n");
#endif
        if(ahn == 1){
            /* really faster? really frequent?
               should not happen with use47 > 0 since we stop when ah=1,
               because a[] < BASE
            */
            assert(use47 == 0);
            /* TODO: case q = 1 */
            q = ah[0] / bh[0];
            rh[0] = ah[0] - q*bh[0];
            qpn = 1;
            /* do not update R->Q */
            qseq_add_last_mpn(R->lq, &q, 1);
        }
        else{
            /* TODO: special case bhn == 1? */
            rh[1] = 0; /* caution! */
            qpn = jebelean_step(R->lq,qh,rh,ah,ahn,bh,bhn,tp,tp_alloc,qgcd);
            if(use47 > 0){
                /* a =  b*q1+r2, 0 <= r2 < b
                   b = r2*q2+r3, 0 <= r3 < r2
                   if r2 <  BASE, algo stops
                   if r2 >= BASE, one has q2 <= b/r2 < BASE
                */
                assert(qpn == 1);
            }
            q = qh[0]; /* even if qpn > 1, but only in case use47 == 0 */
        }
        rhn = bhn;
        if(rhn == 2 && rh[1] == 0)
            rhn = 1;
#if DEBUG_JEBELEAN >= 1
        printf("a[%d]:=", i); MPN_PRINT(bh, bhn);
        printf("; q[%d]:=", i);
        if(ahn == 1) printf("%lu", q); else MPN_PRINT(qh, qpn);
        printf("; a[%d]:=", i+1); MPN_PRINT(rh, rhn);
        printf(";\na[%d]-(a[%d] mod a[%d]);\n", i+1, i-1, i);
#endif
        if(qpn > 1){
            assert(use47 == 0);
#if DEBUG_JEBELEAN >= 0
            fprintf(stderr, "Jebelean: qpn=%lu > 1\n", qpn);
#endif
            break;
        }
#if DEBUG_JEBELEAN >= 3
        u2 = u0-q*u1;
        v2 = v0-q*v1;
#endif
        /* av2 = av0+q*av1; uses rule (A1) */
        /* TODO: (A2) case where everybody fits in a single word */
        /* (4.7)ff: if ahn == 2 [hence a[i+2] >= BASE], then av2 < BASE
           and we can use single stuff; moreover, conditions
           are met, so that we don't need to waste time for them!
        */
        au2 = q*au1+au0;
        if((use47 > 0) && (rhn == 2))
            av2 = q*av1+av0;
        else{
            /* it can happen for use47 > 0 */
            /* (tmphi, tmplo) <- av1*q */
            umul_ppmm(tmphi, tmplo, av1, q);
            /* (tmphi, tmplo) += (0, av0) */
            add_ssaaaa(sumh, av2, tmphi, tmplo, 0, av0);
            if(sumh != 0){
#if DEBUG_JEBELEAN >= 1
                printf("breaking the wave since sumh=%lu\n", sumh);
#endif
                break;
            }
        }
#if DEBUG_JEBELEAN >= 3
        printf("u[%d]:=%ld; v[%d]:=%ld;\n", i+1, u2, i+1, v2);
        assert((mp_limb_t)( (*sgn)*u2) == au2);
        assert((mp_limb_t)(-(*sgn)*v2) == av2);
#endif
        if((use47 > 0) && (rhn == 2)){ /* again (4.7) */
            /* this is the normal case, since ahn = bhn = 2 */
#if DEBUG_JEBELEAN >= 3
            printf("use47: rhn == 2\n");
#endif
        }
        else{
            if(use47 > 0){
#if DEBUG_JEBELEAN >= 1
                printf("use47: a[%d+1]=%lu < BASE\n", i, rh[0]);
#endif
                k = i-2; /* so that a[k+2] >= BASE > a[k+3] */
                stop = 1;
            }
            else{
                /* v2 and v1 are of opposite parity,
                   hence |v2-v1|=av2+av1 */
                /* Collins's condition */
                /* if((a2=rh < av2) || (bh-a2 < labs(v2-v1))) */
#if DEBUG_JEBELEAN >= 3
                printf("rh[0]=%lu, rh[1]=%lu, av2=%lu\n",rh[0],rh[1],av2);
#endif
                if((rhn == 1) && (rh[0] < av2)){
#if DEBUG_JEBELEAN >= 2
                    printf("breaking the wave since rh=%lu < av2=%lu\n",
                           rh[0], av2);
#endif
                    stop = 1;
                }
                else{
                    /* w <- bh-zr; we have zr < bh, hence w > 0 */
                    mpn_sub_n(w, bh, rh, bhn);
#if DEBUG_JEBELEAN >= 3
                    printf("w:="); MPN_PRINT(w, bhn); printf(";\n");
#endif
                    wn = (bhn == 1 ? 1 : (w[1] == 0 ? 1 : bhn));
#if DEBUG_JEBELEAN >= 3
                    printf("w:="); MPN_PRINT(w, wn);
                    printf("; av1:=%ld;\n", av1);
#endif
                    if((wn == 1) && (w[0] < av1)){
                        /* if w < av1, then surely, w < av2+av1 */
#if DEBUG_JEBELEAN >= 2
                        printf("breaking the wave since w=%lu < av1=%lu "
                               "already\n", w[0], av1);
#endif
                        stop = 1;
                    }
                    else{
                        /*mpz_sub_ui(w, w, av1); FIXME: can we have w <= 0? */
                        /* the subtraction must be performed outside of
                           assert(), otherwise it vanishes under NDEBUG */
                        mp_limb_t borrow = mpn_sub_1(w, w, wn, av1);

                        assert(borrow == 0);
                        (void)borrow;
                        if((wn == 2) && (w[1] == 0))
                            wn--;
                        if((wn == 1) && (w[0] < av2)){
#if DEBUG_JEBELEAN >= 2
                            printf("breaking the wave since w-av1=%lu "
                                   "< av2=%lu\n", w[0], av2);
#endif
                            stop = 1;
                        }
                    }
                }
            } /* end of using (4.7) */
        }
        if(stop == 0){
            /* q[i] is correct and single digit */
            assert(qpn == 1);
            //      regular_add_last_mpn(R, &q, qpn, tp, tp_alloc);
            /* ah = bh; bh = rh; */
            for(j = 0; j < bhn; j++){
                ah[j] = bh[j]; bh[j] = rh[j];
            }
            /* newan = (an-2)+ahn; newbn = (bn-2)+bhn;
               the normal case is ahn = 2 and bhn = 2 or 1
            */
            ahn = bhn; bhn = rhn;
#if DEBUG_JEBELEAN >= 3
            printf("((%ld)*ah+(%ld)*bh)-a[%d];\n", u2, v2, i+1);
            u0 = u1; u1 = u2;
            v0 = v1; v1 = v2;
#endif
            au0 = au1; au1 = au2;
            av0 = av1; av1 = av2;
            i++;
            *sgn = -(*sgn);
        }
    } /* while(stop == 0) */
    /* NOTE(review): k is only assigned on the use47 > 0 exit, so this
       presumably fires for use47 == 0 once i > 1 -- confirm. */
    assert(i == (k+2)); /* FIXME: this one is a bit ridiculous... */
    /* au0 = au[k+1] = au[i-1], au1 = au[k+2], au2 = au[k+3], q = q[k+2] */
#if DEBUG_JEBELEAN >= 3
    printf("i=%d k=%d: au0=%lu, au1=%lu, au2=%lu, q=q[k+2]=q[i]=%lu\n",
           i, k, au0, au1, au2, q);
#endif
    *p_i = i;
    *p_k = k;
    *p_q = q;
    *p_au0 = au0; *p_av0 = av0;
    *p_au1 = au1; *p_av1 = av1;
    *p_au2 = au2; *p_av2 = av2;
    *p_ahn = ahn; *p_bhn = bhn; *p_rhn = rhn;
}

/* Decides how far beyond k0 the cofactors can be trusted and realigns the
   cofactors and the quotient list accordingly; returns the retained index.
   PRECONDITION: u0 = u[k0+1], u1 = u[k0+2], u2 = u[k0+3]
                 ah = a[k0+1], bh = a[k0+2], rh = a[k0+3]
                 q = q[k]
   One step is first rolled back with
   a[k] = q[k+1]*a[k+1]+a[k+2]
   giving am1 = a[k],  um1 = u[k],  vm1 = v[k]; then jebelean_condition is
   tried at k0, k0+1 (if use47 >= 2) and k0+2 (if use47 >= 3), stopping at the
   first failure. The number of successes (0..3) selects the fix-up:
     1: shift the cofactors one step back, drop the last two quotients;
     2: drop the last quotient only;
     3: shift the cofactors and ah, bh one step forward.
*/
static inline
int advance_max(regular_t R, int *sgn,
                mp_ptr ah, mp_size_t ahn,
                mp_ptr bh, mp_size_t bhn,
                mp_ptr rh, mp_size_t rhn,
                mp_limb_t *au0, mp_limb_t *au1, mp_limb_t *au2,
                mp_limb_t *av0, mp_limb_t *av1, mp_limb_t *av2,
                mp_limb_t q, int k0, int use47,
                mp_ptr tp, mp_size_t tp_alloc, int qgcd)
{
    mp_limb_t am1[2], aum1, avm1, qk0p1;
    mp_size_t am1n;
    int status = 0, j, k = k0;
#if DEBUG_JEBELEAN >= 1
    long um1, vm1;
    printf("use47.0: correct up to k0=%d\n", k0);
    printf("ah:="); MPN_PRINT(ah, ahn); printf(";\n");
    printf("bh:="); MPN_PRINT(bh, bhn); printf(";\n");
    printf("q:=%lu; // should be q[k0+2] with k0=%d\n", q, k0);
#endif
#if QSEQ_DATA_TYPE == QSEQ_DATA_TAB3
    /* the last stored quotient is q[k0+2]; fetch the one before it */
    assert(R->lq->last-2 >= 0);
    assert(qseq_nl(R->lq, R->lq->last-2) == 1);
    j = R->lq->tind[R->lq->last-2];
    qk0p1 = (j >= 0 ? (mp_limb_t)j : R->lq->tab_large[-j]);
#else
    assert(0);
#endif
#if DEBUG_JEBELEAN >= 1
    /* qlast is always q[k0+1] */
    printf("qk0p1:=%lu; // should be q[k0+1]\n", qk0p1);
#endif
    /* ah = a[k0+1], bh = a[k0+2]
       compute am1 = a[k0], aum1 = au[k0], avm1 = av[k0] */
    rollback_1(am1, &am1n, &aum1, &avm1, ah, ahn, bh, bhn,
               *au0, *au1, *av0, *av1, qk0p1);
#if DEBUG_JEBELEAN >= 3
    um1 = u1 + qk0p1 * u0; vm1 = v1 + qk0p1 * v0;
#endif
    /* climb as long as the condition holds and use47 allows another test */
    if(jebelean_condition(k,am1,am1n,ah,ahn,aum1,*au0,avm1,*av0) != 0){
        k = k0+1;
        status = 1;
#if DEBUG_JEBELEAN >= 1
        printf("use47.1: correct up to k0+1=%d\n", k);
#endif
        if((use47 >= 2)
           && jebelean_condition(k,ah,ahn,bh,bhn,*au0,*au1,*av0,*av1) != 0){
            k = k0+2;
            status = 2;
#if DEBUG_JEBELEAN >= 1
            printf("use47.2: correct up to k0+2=%d\n", k);
#endif
            if((use47 >= 3)
               && jebelean_condition(k, bh, bhn, rh, rhn,
                                     *au1, *au2, *av1, *av2) != 0){
                k = k0+3;
                status = 3;
#if DEBUG_JEBELEAN >= 1
                printf("use47.3: correct up to k0+3=%d\n", k);
#endif
            }
        }
    }
    else{
        /* k is just k0; can it happen? */
#if DEBUG_JEBELEAN >= 1
        printf("use47.z: correct up to k0=%d\n", k);
#endif
    }

    if(status == 0){
        assert(0);
        *au1 = aum1; *av1 = avm1;
        *au0 -= q*aum1;
        *av0 -= q*avm1;
#if DEBUG_JEBELEAN >= 3
        u0 += q*um1;
        v0 += q*vm1;
#endif
        /* we probably need 3 up's */
        regular_oslash(R, tp, tp_alloc);
        regular_oslash(R, tp, tp_alloc);
    }
    else if(status == 1){
        /* at this point, au1 = au[k+1], au0 = au[k],
           we want to have au1 = au[k], au0 = au[k-1]
        */
        *au1 = *au0; *av1 = *av0;
        *au0 = aum1; *av0 = avm1;
        *sgn = - (*sgn);
#if DEBUG_JEBELEAN >= 3
        u1 = u0; v1 = v0; u0 = um1; v0 = vm1;
#endif
        qseq_remove_last(R->lq);
        qseq_remove_last(R->lq);
    }
    else if(status == 2){
        /* throw away last q, all quantities are ok */
        qseq_remove_last(R->lq);
    }
    else if(status == 3){
        /* q?? is ok, au0 <- au[k0+2], au1 <- au[k0+3], ... */
        *au0 = *au1; *au1 = *au2;
        *av0 = *av1; *av1 = *av2;
        *sgn = -(*sgn);
#if DEBUG_JEBELEAN >= 3
        v0 = v1; v1 = v2;
        u0 = u1; u1 = u2;
#endif
        /* ah = bh; bh = rh; so that ah = a[k0+2], bh = a[k0+3]
         */
        for(j = 0; j < bhn; j++){
            ah[j] = bh[j]; bh[j] = rh[j];
        }
        /* only the local copies of the sizes change */
        ahn = bhn; bhn = rhn;
    }
    else{
        printf("gcd_jebelean: should not happen!\n");
        assert(0);
    }
    return k;
}

/** tp <- sgn*(au*a-av*b) with eventually tp >= 0.
    sgn == 1: tp[0..an] receives au*a, then av*b is subtracted from it.
    otherwise: tp[0..bn] receives av*b, the limbs tp[bn+1..an+1] are cleared,
    then au*a is subtracted.
    The borrow propagations are executed unconditionally and only their
    outcome is asserted, so that the result is the same with and without
    NDEBUG.
    NOTE(review): the clearing in the second case reaches tp[an+1]; confirm
    that the caller's scratch area is large enough.
*/
static inline
void jebelean_update_aux(mp_ptr tp,
                         mp_ptr a, mp_size_t an,
                         mp_ptr b, mp_size_t bn,
                         mp_limb_t au, mp_limb_t av, int sgn)
{
    mp_limb_t q;

    if(sgn == 1){
        mp_limb_t borrow;

        tp[an] = mpn_mul_1(tp, a, an, au);
#if DEBUG_JEBELEAN >= 2
        printf("au*a="); MPN_PRINT(tp, an+1); printf("\n");
#endif
        /* tp[0..bn[ -= av*b[0..bn[ */
        q = mpn_submul_1(tp, b, bn, av);
        /* propagate the limb returned by submul into tp[bn..an] */
        borrow = mpn_sub_1(tp+bn, tp+bn, an+1-bn, q);
        assert(borrow == 0);
        (void)borrow;
    }
    else{
        /* av*b >= au*a */
        /* tp[0..bn+1[ = b[0..bn[ * av */
        tp[bn] = mpn_mul_1(tp, b, bn, av);
        /* clear tp[bn+1..an+2[ */
        MPN_ZERO(tp+bn+1, an-bn+1);
#if DEBUG_JEBELEAN >= 2
        printf("av0*B="); MPN_PRINT(tp, bn+1); printf("\n");
#endif
        /* tp[0..an[ -= au*a[0..an[ */
        q = mpn_submul_1(tp, a, an, au);
        if(q > 0){
            const mp_size_t hn = bn+1-an;

            assert(hn > 0);
            if(hn > 0){
                mp_limb_t borrow = mpn_sub_1(tp+an, tp+an, hn, q);

                assert(borrow == 0);
                (void)borrow;
            }
        }
    }
}

/**
   Updating the big numbers.
   (A, B) <- (u0*A+v0*B, u1*A+v1*B) = (sgn*(au0*A-av0*B), -sgn*(au1*A-av1*B)).
   The new A is built in tp[0..], the new B in tp[n+1..]; both are then copied
   back and *p_an, *p_bn receive the normalized sizes. The limbs of B between
   its new and its old size are zeroed.
   PRECONDITION: au0 <= au1; av0 <= av1
*/
static inline void jebelean_update(mp_ptr A, mp_size_t *p_an,
                                   mp_ptr B, mp_size_t *p_bn,
                                   mp_size_t n,
                                   mp_limb_t au0, mp_limb_t au1,
                                   mp_limb_t av0, mp_limb_t av1,
                                   int sgn, mp_ptr tp)
{
    mp_size_t an = *p_an, bn = *p_bn;
    mp_size_t top;
    mp_ptr const newB = tp+n+1; /* newB[0..n+1[ */

#if DEBUG_JEBELEAN >= 3
    printf("u0:=%ld; v0:=%ld;\n", u0, v0);
    assert((mp_limb_t)( sgn*u0) == au0);
    assert((mp_limb_t)(-sgn*v0) == av0);
    printf("u1:=%ld; v1:=%ld;\n", u1, v1);
    assert((mp_limb_t)(-sgn*u1) == au1);
    assert((mp_limb_t)( sgn*v1) == av1);
#endif
    /* au0 <= au1; av0 <= av1 */
#if DEBUG_JEBELEAN >= 4
    printf("au0:=%lu; au1:=%lu; av0:=%lu; av1:=%lu;\n", au0, au1, av0, av1);
#endif
    /* tp   <- u0*A+v0*B =  sgn*(au0*A-av0*B), eventually positive */
    jebelean_update_aux(tp, A, an, B, bn, au0, av0, sgn);
    /* newB <- u1*A+v1*B = -sgn*(au1*A-av1*B), eventually positive */
    jebelean_update_aux(newB, A, an, B, bn, au1, av1, -sgn);

    /* install the new A */
    MPN_NORMALIZE(tp, an);
    MPN_COPY(A, tp, an);

    /* install the new B, wiping the limbs it no longer occupies */
    for(top = bn; top > 0 && newB[top-1] == 0; top--)
        B[top-1] = 0;
    bn = top;
    MPN_COPY(B, newB, bn);

    *p_an = an;
    *p_bn = bn;
#if DEBUG_JEBELEAN >= 1
    printf("updated_A:="); MPN_PRINT(A, an); printf(";\n");
    printf("updated_B:="); MPN_PRINT(B, an); printf(";\n");
#endif
}

/** Applies the quotients R->lq[Rlast..last[ one by one to (A, B),
    alternately reducing A by B and B by A, and stops as soon as the reduced
    operand drops below rmin or the quotients are exhausted. Quotients that
    were not consumed are removed from R with regular_oslash.
    INPUT: A > B > rmin;
           R[Rlast..R->last[ contains the next correct quotients
           (each one must be a small quotient, i.e. tind[] >= 0).
    OUTPUT: *an, *bn updated to the normalized sizes.
    POSTCONDITION: A >= rmin > B.
    The borrow propagations are executed unconditionally and only their
    outcome is asserted, so that the result does not depend on NDEBUG.
 */
MAYBE_UNUSED
static void jebelean_descent(mp_ptr A, mp_size_t *an,
                             mp_ptr B, mp_size_t *bn,
                             mp_ptr rmin, mp_size_t rminn,
                             regular_t R, int Rlast,
                             mp_ptr tp, mp_size_t tp_alloc)
{
    mp_limb_t c, borrow;
    mp_size_t rn;
    int i, j;

#if DEBUG_JEBELEAN >= 1
    printf("Rin[%d..%d[:=", Rlast, R->lq->last);
    qseq_print(R->lq); printf(";\n");
#endif
    for(i = Rlast; i < R->lq->last; ){
        j = R->lq->tind[i];
        assert(j >= 0);
#if DEBUG_JEBELEAN >= 1
        printf("A:="); MPN_PRINT(A, *an); printf(";\n");
        printf("B:="); MPN_PRINT(B, *bn); printf(";\n");
        printf("q:=%d;\n", j);
#endif
        /* A <- A - q*B */
        c = mpn_submul_1(A, B, *bn, (mp_limb_t)j);
        if(c != 0){
            borrow = mpn_sub_1(A+(*bn), A+(*bn), (*an)-(*bn), c);
            assert(borrow == 0);
            (void)borrow;
        }
        MPN_NORMALIZE(A, *an);
        if((i == R->lq->last-1)
           || (*an < rminn || (*an == rminn && mpn_cmp(A, rmin, rminn) < 0))){
            /* A is now the smaller operand: exchange the roles of A and B */
#if DEBUG_JEBELEAN >= 1
            printf("swappy2\n");
#endif
            MPN_SWAP(A, B, *bn);
            MPN_ZERO(B+(*an), (*bn)-(*an));
            rn = *an; *an = *bn; *bn = rn;
#if DEBUG_JEBELEAN >= 1
            printf("exiting half-way\n");
#endif
            break;
        }
        i++;
        j = R->lq->tind[i];
        assert(j >= 0);
#if DEBUG_JEBELEAN >= 1
        printf("B:="); MPN_PRINT(B, *bn); printf(";\n");
        printf("A:="); MPN_PRINT(A, *an); printf(";\n");
        printf("q:=%d;\n", j);
#endif
        /* B <- B - q*A */
        c = mpn_submul_1(B, A, *an, (mp_limb_t)j);
        if(c != 0){
            borrow = mpn_sub_1(B+(*an), B+(*an), (*bn)-(*an), c);
            assert(borrow == 0);
            (void)borrow;
        }
        MPN_NORMALIZE(B, *bn);
        if(*bn < rminn || (*bn == rminn && mpn_cmp(B, rmin, rminn) < 0))
            break;
        i++;
    }
#if DEBUG_JEBELEAN >= 1
    if(i >= R->lq->last){
        /* we exhausted R and still b > rmin */
        count_leading_zeros(j, rmin[rminn-1]);
        printf("bn=%lu rminn=%lu %lu %lu -> clr=%d\n",
               *bn, rminn, B[*bn-1], rmin[rminn-1], j);
    }
#endif
#if DEBUG_JEBELEAN >= 1
    printf("bn=%lu rminn=%lu %lu %lu\n", *bn, rminn, B[*bn-1], rmin[rminn-1]);
#endif
    assert((*an > *bn) || (*an == *bn && mpn_cmp(A, B, *an) >= 0));
    /* we need remove the last pieces */
    Rlast = R->lq->last;
    for(j = Rlast-1; j > i; j--)
        regular_oslash(R, tp, tp_alloc);
#if DEBUG_JEBELEAN >= 1
    printf("Rout:="); qseq_print(R->lq); printf(";\n");
#endif
}

/* Selects when the quotients collected in R->lq are flushed by
   jebelean_gcd: once after its main loop (0) or at every iteration (1). */
#define OLD_STUFF 0 /* 0 for a single flush (new); 1 for old stuff */
 
/** Main driver: repeatedly extracts the two leading words of A and B,
    runs inside_loop / advance_max on them to collect single-word quotients,
    and applies the resulting cofactors to the full numbers. Leaves the main
    loop when the stopping condition is met, when B becomes short
    (bn <= JEBELEAN_EUCLID_THRESHOLD), when an-bn > 2 or when the leading part
    of B is a single word; Euclid's algorithm finishes the job if needed.
    INPUT: A > B with an+1 words, an >= bn >= 2, A > rmin.
           tp[0..2*an+2[
           R a regular pair
    PRECONDITION: R->Q == R->lq
    SIDE-EFFECT:
            if qgcd == 0:
                 A >= rmin[0..rminn[ > B;
                 or ||A|| >= m > ||B||
            elif qgcd == 1:
                 ||Q_i|| <= base^m < ||Q_{i+1}|| if m > 0
                 ||Q_i|| <= emin < ||Q_{i+1}||   if m == 0
            A[n] (resp. B[n]) contain the number of digits of A (resp. B).
            R is updated with the correct quotients found.
    POSTCONDITION: R->Q == R->lq by accumulation.
    use47 refers to the use of condition (4.7) p. 152 and all the consequences
    leading to the use of i, i+1 or i+2.
    When use47 = 0, nothing of this is used;
                 1, test a_{i+2} >= 2^GMP_NUMB_BITS;
                 2, test rhs;
                 3, test both sides.
    Aborts the program if the temporary buffers of the multiprecision
    fallback step cannot be allocated.
*/
void jebelean_gcd(regular_t R,
                  mp_ptr A, mp_size_t an,
                  mp_ptr B, mp_size_t bn,
                  mp_size_t m, mp_ptr rmin, mp_size_t rminn,
                  mp_ptr tp, mp_size_t tp_alloc, int qgcd)
{
    mp_limb_t ah[3], bh[3], rh[2];
    mp_limb_t q, au0, au1, au2, av0, av1, av2;
    /* rhn is read by inside_loop before being written: start from 0 */
    mp_size_t n = an, ahn, bhn, rhn = 0, tmpn;
    int i, sgn, use47 = 3, k = -1, cond;
    unsigned int clz;
    /* lq[0..last[ is already filled */
    int Rfirst, Rtmp, Rlast;
#if COUNT_STEPS
    int nsteps = 0;
#endif
#if DEBUG_JEBELEAN >= 1
    mp_size_t Aorign = an, Borign = bn;
    mp_ptr Aorig = (mp_ptr)malloc(an * sizeof(mp_limb_t));
    mp_ptr Borig = (mp_ptr)malloc(bn * sizeof(mp_limb_t));
    MPN_COPY(Aorig, A, an);
    MPN_COPY(Borig, B, bn);
#endif
    if(qgcd == 1)
        R->updateQ = 1;
#if OLD_STUFF == 0
    Rfirst = R->lq->last;
#endif
    while(1){ /* main loop */
        /* at this point, R->Q == R->lq, memorize for flushing */
#if OLD_STUFF > 0
        Rfirst = R->lq->last;
#endif
#if DEBUG_JEBELEAN >= 1
        printf("an=%lu bn=%lu m=%lu\n", an, bn, m);
        printf("A:="); MPN_PRINT(A, an);
        printf(";\nB:="); MPN_PRINT(B, bn);
        if(rmin != NULL){
            printf(";\nrmin:="); MPN_PRINT(rmin, rminn);
        }
        else
            printf(";\n||A|| >= m? %d; ||B|| >= m? %d\n",
                   mpn_gcd_is_norm_ge(A, an, m), mpn_gcd_is_norm_ge(B, bn, m));
        printf(";\n");
        regular_print(R); printf(";\n");
#endif
        if(gcd_stopping_condition(B, bn, m, rmin, rminn, 0, R, qgcd) != 0){
#if DEBUG_JEBELEAN >= 1
            printf("Jebelean: stopping condition met\n");
#endif
            break;
        }
        /* at this point, A > B >= rmin or ||B|| >= m */
        assert(an >= bn);
        if(bn <= JEBELEAN_EUCLID_THRESHOLD){ /* etiq0 */
            /* exit to perform Euclid gcd */
#if DEBUG_JEBELEAN >= 1
            printf("Jebelean: bn=%lu <= euclid_threshold=%d\n",
                   bn, JEBELEAN_EUCLID_THRESHOLD);
#endif
            break;
        }
        /* at this point, an >= bn >= JEBELEAN_EUCLID_THRESHOLD >= 3 */
        if((an-bn) > 2){ /* etiq4 */
#if DEBUG_JEBELEAN >= 1
            printf("Jebelean: an-bn=%lu > 2; exiting.\n", an-bn);
#endif
            break;
        }
#if COUNT_STEPS
        nsteps++;
#endif
        /* maximaly shift a into ah and shift b into bh of the account */
        count_leading_zeros(clz, A[an-1]);
        if(clz != 0){
            mpn_lshift(ah, A+(an-3), 3, clz);
            mpn_lshift(bh, B+(an-3), 3, clz);
        }
        else{
            /* A is already left-aligned; mpn_lshift does not accept a zero
               shift count, so simply copy the three leading limbs */
            MPN_COPY(ah, A+(an-3), 3);
            MPN_COPY(bh, B+(an-3), 3);
        }
        /* shift back */
        ah[0] = ah[1]; ah[1] = ah[2];
        bh[0] = bh[1]; bh[1] = bh[2];
        /* contrary to gmp, we always have ahn, bhn > 0 */
        ahn = 2;
        bhn = (bh[1] == 0 ? 1 : 2);
        assert((bhn == 1) || (ah[1] >= bh[1]));
#if DEBUG_JEBELEAN >= 1
        printf("ahm :="); MPN_PRINT2(ah, ahn);
        printf(";\nbhm :="); MPN_PRINT2(bh, bhn);
        if(rmin != NULL && rminn >= 2){
            printf(";\nrmin:="); MPN_PRINT2(rmin+(rminn-2), 2);
        }
        printf(";\n");
#endif
        if(bhn == 1){ /* etiq1 */
            /* we have ah[1] > BASE/2, so quotient will be enormous */
# if DEBUG_JEBELEAN >= 1
            fprintf(stderr, "etiq1: bhn == 1 => switching to Euclid\n");
# endif
            break;
        }
        /* at this point: ahn = bhn = 2 */
        Rlast = R->lq->last;
        inside_loop(&i, &k, &sgn, &q, &au0, &av0, &au1, &av1, &au2, &av2,
                    R, ah, &ahn, bh, &bhn, rh, &rhn, tp, tp_alloc,
                    qgcd, use47);
        /* au0 = au[k+1], ...
           au1 = au[k+2], ...
           au2 = au[k+3], ...
         */
#if COUNT_STEPS
        printf("k=%d", i);
#endif
        if(use47 == 0 || i == 1){
            /* use47 and i = 1 => k = -1; we'd better stop!!! */
            k = i;
        }
        else{ /* a[k+2] >= BASE > a[k+3], so final i is in k, k+1, k+2 */
            /* only R->lq is updated */
            i = advance_max(R, &sgn, ah, ahn, bh, bhn, rh, rhn,
                            &au0, &au1, &au2, &av0, &av1, &av2, q,
                            k, use47, tp, tp_alloc, qgcd);
#if DEBUG_JEBELEAN >= 3
            printf("au0=au[%d]=%lu au1=au[%d]=%lu\n",
                   i-1, au0, i, au1);
            printf("av0=av[%d]=%lu av1=av[%d]=%lu sgn=%d\n",
                   i-1, av0, i, av1, sgn);
#endif
        }
#if COUNT_STEPS
        printf(" => final i=%d\n", i);
#endif
        if(i == 1){ /* etiq3: rare event... */
            mp_ptr qq = (mp_ptr)malloc((an+1) * sizeof(mp_limb_t));
            mp_ptr rr = (mp_ptr)malloc((an+1) * sizeof(mp_limb_t));
            mp_size_t qqn;

            if(qq == NULL || rr == NULL){
                /* no way to report an error to the caller: give up */
                fprintf(stderr, "Jebelean: out of memory\n");
                free(qq);
                free(rr);
                abort();
            }
#if DEBUG_JEBELEAN >= 1
            fprintf(stderr, "Jebelean: weird loop => multiprecision step\n");
#endif
            /* FIXME: put this somewhere else, so that we can update R->Q? */
            /* one quotient was added anyway */
            qseq_remove_last(R->lq);
            /* (A, B) <- (A mod B, A) */
            qqn = euclidean_step(R->lq, qq, rr, A, an, B, bn, tp, tp_alloc);
#if OLD_STUFF > 0
            /* HERE: almost ready for the final trick: do not update R->Q
               here, but *outside* the whole loop...!! */
            hgcd_matrix_mul_q(R->Q, qq, qqn, tp, tp_alloc);
#endif
            MPN_COPY(A, rr, bn);
            free(qq);
            free(rr);
            MPN_SWAP(A, B, bn);
            tmpn = an; an = bn; bn = tmpn;
            /* clear B[bn..an[ */
            MPN_NORMALIZE(B, bn);
            MPN_ZERO(B+bn, an-bn);
        }
        else{
#if 0
            /* does not pay off, apparently => more thinking? */
            if((rmin != NULL) && (bn - rminn) <= 1)
                jebelean_descent(A, &an, B, &bn, rmin, rminn, R, Rlast, tp);
            else
#endif
                jebelean_update(A, &an, B, &bn, n, au0, au1, av0, av1, sgn,tp);
#if OLD_STUFF > 0
            regular_flush_lq(R, Rfirst);
#endif
        }
#if DEBUG_JEBELEAN >= 1 /* not clear it should work */
        printf("LQ:="); qseq_print(R->lq); printf(";\n");
#endif
#if DEBUG_JEBELEAN >= 3 /* not clear it should work */
        if(R->lq != NULL && qseq_is_empty(R->lq) == 0){
            Rtmp = R->lq->first;
            R->lq->first = Rfirst;
            printf("lq:="); qseq_print(R->lq); printf(";\n");
            qseq_check(R->lq, Aorig, Aorign, Borig, Borign, A, an, B, bn);
            R->lq->first = Rtmp;
        }
#endif
    } /* while(1) */
#if OLD_STUFF == 0
    regular_flush_lq(R, Rfirst);
#endif
    /* we should exit with R->Q == R->lq */
#if COUNT_STEPS
    printf("nsteps=%d\n", nsteps);
#endif
#if DEBUG_JEBELEAN >= 1
    check_Q_from_lq(R->Q, R->lq);
    printf("at the end of main loop: A:="); MPN_PRINT(A, an);
    printf("; B:="); MPN_PRINT(B, bn);
    if(rmin != NULL){
        printf("; rmin:="); MPN_PRINT(rmin, rminn);
    }
    printf(";\n");
    printf("R:="); regular_print(R); printf(";\n");
#endif
    /*    if(backup == 0){*/
    if(gcd_stopping_condition(B, bn, m, rmin, rminn, 0, R, qgcd) == 0){
        /* B >= rmin or ||B|| >= m */
#if DEBUG_JEBELEAN >= 1
        printf("Jebelean: performing Euclid gcd\n");
#endif
        euclid_gcd(R, A, an, B, bn, m, rmin, rminn, tp, tp_alloc, qgcd);
        /* presumably euclid_gcd leaves the final sizes in A[an] and B[an];
           move them to slot n -- TODO confirm against euclid_gcd */
        A[n] = A[an];
        B[n] = B[an];
    }
    else{
        /* rmin > B or m > ||B|| */
        /* at this point, we know nothing on A; for instance there can be
           a gap m >= an > bn */
        if(m == 0)
            cond=((an > rminn) || (an == rminn && mpn_cmp(A,rmin,rminn) >= 0));
        else
            cond = mpn_gcd_is_norm_ge(A, an, m);
        if(cond != 0){
            /* A >= rmin > B or ||A|| >= m > ||B|| */
            A[n] = an;
            B[n] = bn;
        }
        else{ /* rmin >= A >= B or m > ||A|| */
#if DEBUG_JEBELEAN >= 1
            printf("Jebelean: we must backup\n");
            printf("an=%lu bn=%lu rminn=%lu\n", an, bn, rminn);
            printf("A >= rmin: %d; B >= rmin: %d\n",
                   (an>rminn || (an == rminn && mpn_cmp(A, rmin, rminn) >= 0)),
                   (bn>rminn || (bn == rminn && mpn_cmp(B, rmin, rminn) >= 0)));
#endif
            MPN_ZERO(B+bn, an-bn);
            /* if OLD_STUFF is 0, R->Q == R->lq, so we must oslash */
            an=euclid_gcd_backwards(R,A,an,B,bn,m,rmin,rminn,GMP_NUMB_BITS);
            bn = an;
            MPN_NORMALIZE(A, an);
            MPN_NORMALIZE(B, bn);
            A[n] = an;
            B[n] = bn;
        }
    }
    /* useful only when Jebelean is called directly */
#if DEBUG_JEBELEAN >= 3
    if(R->lq != NULL && qseq_is_empty(R->lq) == 0){
        Rtmp = R->lq->first;
        R->lq->first = Rfirst;
        printf("lq:="); qseq_print(R->lq); printf(";\n");
        qseq_check(R->lq, Aorig, Aorign, Borig, Borign, A, A[n], B, B[n]);
        R->lq->first = Rtmp;
    }
#endif
#if DEBUG_JEBELEAN >= 1
    /* release the debug copies at the same level that allocated them */
    free(Aorig);
    free(Borig);
#endif
}

