/*! \file
    \brief Thull-Yap GCD algorithm
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#include "gcd_utils.h"
#include "qseq.h"
#include "gcd_matrix.h"
#include "regular.h"

#include "gcd_common.h"
#include "gcd_euclid.h"
#include "gcd_thull_yap.h"

/* Compile-time switches for this file (all off by default).
   DEBUG_TY       : verbosity of the debug prints and extra checks; the code
                    below tests levels >= 1, >= 2, >= 3, >= 100 and >= 200.
   DEBUG_MAT2x2   : extra matrix prints in mpz_qseq_apply_inverse.
   PRINT_WARNINGS : print the "WARNING ..." messages for rare code paths.
*/
#define DEBUG_TY       0
#define DEBUG_MAT2x2   0
#define PRINT_WARNINGS 0

/* TIMINGS > 0 prints runtime() measurements on stderr.
   NOTE(review): STATS is not tested in the part of the file reviewed here. */
#define TIMINGS        0
#define STATS          0

/* Printed as "{%d}" in debug and timing messages; presumably the current
   recursion depth -- confirm where it is updated. */
#if DEBUG_TY >= 1 || TIMINGS > 0
int ty_level = 0;
#endif

/* With FINDTHRESHOLD defined, the small-size threshold is a run-time
   variable (read by get_small_threshold()) instead of TY_SMALL_THRESHOLD. */
#ifdef FINDTHRESHOLD
int ty_small_threshold = 200;
#endif

/** OUTPUT: the norm threshold used by thull_yap_is_small_2().
    With FINDTHRESHOLD defined this is the run-time tunable
    ty_small_threshold, otherwise the compile-time TY_SMALL_THRESHOLD.
    REM: declared with (void) so that the definition is a real prototype;
         an empty parameter list "()" leaves the arguments unchecked.
*/
static inline int get_small_threshold(void)
{
#ifdef FINDTHRESHOLD
    return ty_small_threshold;
#else
    return TY_SMALL_THRESHOLD;
#endif
}

#if DEBUG_TY >= 1
/** Debug helper: apply a signed inverse matrix to the column vector (a, b).

    INPUT: a, b;
           absinv = [[s, q], [r, p]], the entries of the inverse in absolute
           value (each entry is read as absinv->n limbs);
           sgn: when sgn == -1 both results are negated at the end.
    SIDE-EFFECT: (ap, bp) <- sgn * (s*a - q*b, p*b - r*a)
    REM: we may expect ap << a, bp << b.

    M^(-1) = [[p, q], [r, s]]^(-1) = det(M) * [[s, -q], [-r, p]].
    absinv = [[s, q], [r, p]].

    ap is completely written before bp is computed from a and b, so ap must
    not alias a or b.
    NOTE(review): the two prints of a and b below are not guarded by
    DEBUG_TY >= 2 like the other prints, and they pass SIZ(), a signed size;
    presumably a and b are always >= 0 here -- confirm.
*/
static void mpz_hgcd_matrix_apply_inverse(mpz_t ap, mpz_t bp,
					  struct hgcd_matrix *absinv, int sgn,
					  mpz_t a, mpz_t b)
{
    mpz_t z; /* scratch: one matrix entry at a time, converted to mpz */

    mpz_init_set_ui(z, 0);
#if DEBUG_TY >= 2
    printf("sgn:=%d;\n", sgn);
    printf("s:="); MPN_PRINT(absinv->p[0][0], absinv->n); printf(";\n");
    printf("q:="); MPN_PRINT(absinv->p[0][1], absinv->n); printf(";\n");
    printf("r:="); MPN_PRINT(absinv->p[1][0], absinv->n); printf(";\n");
    printf("p:="); MPN_PRINT(absinv->p[1][1], absinv->n); printf(";\n");
#endif
    /* ap <- s * a */
    printf("a:="); MPN_PRINT(PTR(a), SIZ(a)); printf(";\n");
    printf("b:="); MPN_PRINT(PTR(b), SIZ(b)); printf(";\n");
    MPZ_SET_MPN(z, absinv->p[0][0], absinv->n);
    mpz_mul(ap, z, a);
#if DEBUG_TY >= 2
    gmp_printf("s*a:=%Zd;\n", ap);
#endif
    /* ap -= q * b */
    MPZ_SET_MPN(z, absinv->p[0][1], absinv->n);
    mpz_submul(ap, z, b);
#if DEBUG_TY >= 2
    gmp_printf("(%d)*ap:=%Zd;\n", sgn, ap);
#endif
    /* bp <- p * b */
    MPZ_SET_MPN(z, absinv->p[1][1], absinv->n);
    mpz_mul(bp, z, b);
#if DEBUG_TY >= 2
    gmp_printf("p*b:=%Zd;\n", bp);
#endif
    /* bp -= r * a */
    MPZ_SET_MPN(z, absinv->p[1][0], absinv->n);
    mpz_submul(bp, z, a);
#if DEBUG_TY >= 2
    gmp_printf("(%d)*bp:=%Zd;\n", sgn, bp);
#endif
    mpz_clear(z);
    /* multiply by the sign: only sgn == -1 changes anything */
    if(sgn == -1){
	mpz_neg(ap, ap);
	mpz_neg(bp, bp);
    }
}
#endif

#if DEBUG_TY >= 1
/** Debug helper: (ap, bp) = M^{-1} (a, b), where M is the product matrix
    built from the quotient sequence Q.

    INPUT: Q, a, b.
    SIDE-EFFECT: ap, bp are overwritten; for an empty Q they are plain copies
                 of a and b.
    REM: essentially never used, or at least, rarely, except for debug.
         The sign passed to mpz_hgcd_matrix_apply_inverse is
         regular_determinant(Q).
*/
static void mpz_qseq_apply_inverse(mpz_t ap, mpz_t bp, qseq_t Q, mpz_t a, mpz_t b)
{
#if TIMINGS >= 1
    double tt = runtime();
#endif
    struct hgcd_matrix M, invM;
#if DEBUG_TY >= 1
    gmp_printf("mpz_qseq_apply_inverse: a:=%Zd;\nb:=%Zd;\n", a, b);
#endif
#if 0
    mpz_qseq_apply_inverse_plain(ap, bp, Q, a, b);
#else
    /* empty sequence: M is the identity, nothing to build or to clear */
    if(qseq_is_empty(Q)){
	mpz_set(ap, a);
	mpz_set(bp, b);
	return;
    }
    /* M <- product matrix of Q */
    qseq_build_product(&M, Q);
#if TIMINGS >= 1
    fprintf(stderr, "{%d} build_product: %lf\n", ty_level, runtime()-tt);
#endif
#if DEBUG_MAT2x2 >= 1
    printf("lq:="); qseq_print(Q); printf(";\n");
    printf("Mf:="); hgcd_matrix_print(&M); printf(";\n");
#endif
    /* invM <- inverse of M with entries in absolute value */
    hgcd_matrix_abs_inverse(&invM, &M);
#if DEBUG_MAT2x2 >= 1
    printf("invM:="); hgcd_matrix_print(&invM);
#endif
    mpz_hgcd_matrix_apply_inverse(ap, bp, &invM, regular_determinant(Q), a, b);
#if DEBUG_MAT2x2 >= 1
    gmp_printf("a:=%Zd;\nb:=%Zd;\nap:=%Zd;\nbp:=%Zd;\n", a, b, ap, bp);
#endif
    /* both matrices were set up in this function: release them here */
    hgcd_matrix_clear(&M);
    hgcd_matrix_clear(&invM);
#endif
}
#endif

#if 0 // not used, deprecated?
/* Plain version, where we multiply one q-matrix at a time.
   Note that [[0, 1], [1, -q]] * [eps*x, -eps*y] = [-eps*y, eps * (x+q*y)]
   starting from [[]] * [-a1, b1] with eps = -1.
   This is coherent with the sign in build_product, but be careful...
   At the end, we have eps = delta = det(R).

   TODO: this should disappear, since M should have been updated earlier on.
         And M is not used.

   NOTE(review), should this code ever be re-enabled:
   - tmp is grown with realloc in the loop and never freed;
   - w[0] and w[1] are malloc'ed here, so the caller would have to free them;
   - in the swap at the end of the loop body, wn[0] is overwritten with
     wn[1] before being used as the length of tmp, so both lengths end up
     equal to the old wn[1]: looks wrong, confirm against gcd_addmul;
   - the printf in the QSEQ_DATA_TAB3 branch is not guarded by DEBUG_TY.
 */
static void apply_inverse_plain(mp_ptr *w, mp_size_t *wn,
				struct hgcd_matrix *M, qseq_t R,
				mp_ptr a1, mp_size_t a1n,
				mp_ptr b1, mp_size_t b1n)
{
    mp_ptr tmp = NULL;
    mp_size_t tmpn;
    int i, j, eps = 1;

    /* w[0] <- a1, w[1] <- b1, each in a buffer of 2*a1n limbs */
    w[0] = (mp_ptr)malloc((a1n << 1) * sizeof(mp_limb_t));
    wn[0] = a1n;
    MPN_COPY(w[0], a1, a1n);
    w[1] = (mp_ptr)malloc((a1n << 1) * sizeof(mp_limb_t));
    wn[1] = b1n;
    MPN_COPY(w[1], b1, b1n);
#if QSEQ_DATA_TYPE == QSEQ_DATA_TAB1 || QSEQ_DATA_TYPE == QSEQ_DATA_TAB2 || QSEQ_DATA_TYPE == QSEQ_DATA_TAB3
    for(i = R->first; i < R->last; i++){
	/* [[0, 1], [1, -q]] * [w[0], w[1]] 
	   or w[0] += q*w[1] and swap
	 */
	mp_size_t len = qseq_nl(R, i);

#if DEBUG_TY >= 2
	printf("w0:="); MPN_PRINT(w[0], wn[0]);
	printf(";\nw1:="); MPN_PRINT(w[1], wn[1]); printf(";\n");
#endif
	/* tmp <- w[0], zero-padded to hold w[0] + q*w[1] */
	tmpn = 2 + max(wn[0], (len == 0 ? 1 : len) + wn[1]);
	tmp = realloc(tmp, tmpn * sizeof(mp_limb_t));
	MPN_COPY(tmp, w[0], wn[0]);
	MPN_ZERO(tmp+wn[0], tmpn-wn[0]);
# if TRICK_FOR_ONE == 1
	if(qseq_is_cell_one(R, i) != 0){
	    mp_limb_t q = 1;
	    gcd_addmul(tmp, wn, &q, 1, w[1], wn[1]);
	}
	else
# endif
# if QSEQ_DATA_TYPE == QSEQ_DATA_TAB3
	    {
		/* tind[i] >= 0 is used as the quotient itself; a negative
		   value is negated and used as an index into tab_large */
		j = R->tind[i];
		if(j >= 0){
		    mp_limb_t lq = j;
		    printf("tmp=%lu, wn[0]=%lu, wn[1]=%lu\n",
			   tmpn, wn[0], wn[1]);
		    gcd_addmul(tmp, wn, &lq, 1, w[1], wn[1]);
		}
		else{
		    j = -j;
		    gcd_addmul(tmp, wn, R->tab_large+j, R->tab_large_nl[j],
			       w[1], wn[1]);
		}
	    }
# else
	    gcd_addmul(tmp, wn, qseq_n(R, i), len, w[1], wn[1]);
# endif
	/* swap */
	MPN_COPY(w[0], w[1], wn[1]); wn[0] = wn[1];
	MPN_COPY(w[1], tmp, wn[0]); wn[1] = wn[0];
	eps = -eps; /* FIXME: probably useless */
    }
#else
    assert(0);
#endif
}
#endif /* deprecated? */

/* Compute w = A * (a1, b1), where A = [[s, q], [r, p]] holds the entries of
   R->Q = [[p, q], [r, s]] with the diagonal swapped.

   INPUT: R->Q is the cumulative matrix that contains [0..lq->first[.
          is_ab == 1 for the first call (hence original a and b),
                   0 for the second call
   SIDE-EFFECT: we extend using [lq->first..lq->last[.
                w[0] and w[1] are malloc'ed here and must be freed by the
                caller; wn[0], wn[1] are first set to the allocated sizes
                and then handed to the multiplication routine (presumably
                updated there to the actual lengths -- confirm).
   We are computing 
     [[s, -q], [-r, p]]*[-a1, b1] = [[s, q], [r, p]]*[a1, b1]
   which is the answer to be multiplied by 1/delta = delta.
   REM: only is_ab == 0 is supported for the moment: anything else prints
        "Not converted" and exits.
*/
static void apply_inverse_with_product(mp_ptr *w, mp_size_t *wn,
				       regular_t R,
				       mp_ptr a1, mp_size_t a1n,
				       mp_ptr b1, mp_size_t b1n,
				       int is_ab)
{
#if TIMINGS >= 1
    double tt = runtime();
#endif
    mp_ptr A[4], v[2];
    mp_size_t An[4], nA, nv, vn[2];
    int i;
    /* real inverse is [[s, -q], [-r, p]]/det(R->Q) */
    /* A <- [[s, q], [r, p]] from [[p, q], [r, s]] */
    /* convert */
    if(is_ab != 1){
	/* is_ab == 0 => same size, no fancy code */
#if DEBUG_TY >= 1
	printf("aiwpR:="); regular_print(R); printf(";\n");
#endif
	/* A points directly into R->Q, nothing is copied */
	A[0] = R->Q->p[1][1];
	A[3] = R->Q->p[0][0];
	A[1] = R->Q->p[0][1];
	A[2] = R->Q->p[1][0];
	/* all entries are stored on R->Q->n limbs: strip high zero limbs */
	for(i = 0; i < 4; i++){
	    An[i] = R->Q->n;
	    MPN_NORMALIZE(A[i], An[i]);
	}
	v[0] = a1; vn[0] = a1n;
	v[1] = b1; vn[1] = b1n;
	nA = R->Q->n;
	nv = max(a1n, b1n);
    }
    else{
#if 1
	/* the equal-size variant below was never ported */
	printf("Not converted\n");
	exit(-1);
#else
	/* equalize all An's and vn's */
	feed_A_v(A, &nA, v, &nv, Mtmp, a1, a1n, b1, b1n);
	/* below: used to build w[0..2[ */
	for(i = 0; i < 4; i++)
	    An[i] = nA;
	for(i = 0; i < 2; i++)
	    vn[i] = nv;
#endif
    }
#if DEBUG_TY >= 2
    /*    printf("absinvM_%lu:=", mat2x2_size(Mtmp)); mat2x2_print(Mtmp);*/
    printf("v0:="); MPN_PRINT(v[0], vn[0]); printf(";\n");
    printf("v1:="); MPN_PRINT(v[1], vn[1]); printf(";\n");
#endif
    /* w <- A * v
       size(w[i]) = size(A[2*i]*v[0]+A[2*i+1]*v[1])
     */
#if TIMINGS >= 1
    tt = runtime();
#endif
    for(i = 0; i < 2; i++){
	/* one limb more than the longer of the two products, for the sum */
	wn[i] = 1 + max(An[2*i]+vn[0], An[2*i+1]+vn[1]);
#if DEBUG_TY >= 2
	printf("SW: size(w[%d])=%lu\n", i, wn[i]);
#endif
	w[i] = (mp_ptr)malloc(wn[i] * sizeof(mp_limb_t));
    }
    if(is_ab == 0)
	mat2x2_mul_vec_mpn_plain(w, wn, A, An, v, vn);
    else
	/* not reached with is_ab == 1, since that case exits above */
#if 1
	mat2x2_mul_vec_mpn222(w, wn, A, nA, v, nv);
#else
	mat2x2_mul_vec_mpn(w, wn, A, An, v, vn);
#endif
#if TIMINGS >= 1
    fprintf(stderr, "{%d} mat_mul_%d: %lu %lu %lf\n",
	    ty_level, is_ab, An[0], vn[0], runtime()-tt);
#endif
}

/* The crucial remark is the following:
   inv([[p, q], [r, s]]) = det(M)*[[s, -r], [-q, p]]
   and invM has to be applied to (-a1, b1). So
   invM * (-a1, b1) = (-(s*a1+r*b1), q*a1+p*b1)
   here, we really compute (s*a1+r*b1, q*a1+p*b1) with only positive entries!
   NOTE(review): q and r are exchanged in the formulas above compared with
   the other comments of this file ([[s, -q], [-r, p]]) and with the entries
   picked by apply_inverse_with_product, which computes
   (s*a1+q*b1, r*a1+p*b1) -- confirm which convention is intended.
   INPUT: is_ab == 1 for the first call (hence original a and b),
                   0 for the second call
   SIDE-EFFECT: w[0..2[ and wn[0..2[ are filled by
                apply_inverse_with_product (which allocates w[0] and w[1]).
   TODO: Q should disappear and/or this entire function which only calls
         apply_inverse_with_product.
*/
static void thull_yap_compute_special_product(mp_ptr *w, mp_size_t *wn,
					      regular_t R,
					      mp_ptr a1, mp_size_t a1n,
					      mp_ptr b1, mp_size_t b1n,
					      int is_ab)
{
#if TIMINGS > 0
    double tt = runtime();
#endif
#if DEBUG_TY >= 1
    printf("apply_inverse with R:="); regular_print(R); printf(";\n");
# if DEBUG_TY >= 100
    check_Q_from_lq(R->Q, R->lq);
# endif
#endif
#if 0 // deprecated
    if(0 && qseq_card(Q) < TY_EXTEND_THRESHOLD)
	apply_inverse_plain(w, wn, M, Q, a1, a1n, b1, b1n);
    else
#endif
	/* the only live path: multiply by the cumulative matrix of R */
	apply_inverse_with_product(w, wn, R, a1, a1n, b1, b1n, is_ab);
#if TIMINGS >= 2
    printf("a1n: %lu b1n: %lu nq: %d %lf\n",
	   a1n, b1n, qseq_card(R->lq), runtime()-tt);
#endif
}

/**********************************************************************
       hGCD a` la Thull/Yap, p. 69
 **********************************************************************/

/* U -- M --> V <=> U = M*V, M = [[Q, 1], [1, 0]], det(M) = -1.
   a, b straddle c if a >= c > b.
   M^(-1) = [[p, q], [r, s]]^(-1) = det(M)*[[s, -q], [-r, p]].
 */

/** Splitting bound m for an operand of an limbs.
    With ||a|| = log_B(|a|), the result is 1+ceil(||a||/2):
    - a = B^k (an = k+1 limbs): 1 + ceil(k/2);
    - B^(n-1) < a < B^n (an = n limbs): 2 + floor((n-1)/2).
    REM: mpn_is_power_of_base presumably returns -1 when a is not a power
         of the base; any other value selects the first formula.
*/
static inline mp_size_t thull_yap_bound(mp_ptr a, mp_size_t an)
{
    const int is_power = (mpn_is_power_of_base(a, an) != -1);
    const mp_size_t k = an - 1;
    const mp_size_t half = k >> 1;

    if(is_power)
	return 1 + half + (k & 1); /* 1 + ceil(k/2) */
    return 2 + half;
}

/* OUTPUT: the answer of mpn_gcd_is_norm_lt to "||a|| < bound(a)?", where
           bound(a) = thull_yap_bound(a, an).
   TODO: change this? The caller may have computed the bound already. */
static inline int thull_yap_is_small_1(mp_ptr a, mp_size_t an)
{
    mp_size_t bound;

#if DEBUG_TY >= 3
    printf("is_small_1: a(%lu):=", an); MPN_PRINT(a, an);
    printf("; floor(norm)=%lu bound=%lu\n", mpn_gcd_floor_norm(a, an),
	   thull_yap_bound(a, an));
#endif
    bound = thull_yap_bound(a, an);
    return mpn_gcd_is_norm_lt(a, an, bound);
}

/* OUTPUT: the answer of mpn_gcd_is_norm_lt to "||a|| < threshold?", where
           the threshold is get_small_threshold(). */
static inline int thull_yap_is_small_2(mp_ptr a, mp_size_t an)
{
#if DEBUG_TY >= 3
    printf("TY_IsSmall2: a:="); MPN_PRINT(a, an);
    /* get_small_threshold() returns an int: it must be printed with %d,
       passing it to %lu is undefined behaviour */
    printf("; norm=%lu bound=%d\n",
	   mpn_ty_norm(a, an), get_small_threshold());
#endif
    return mpn_gcd_is_norm_lt(a, an, get_small_threshold());
}

/* Write a = a0*BASE^m-a1, with
     a0 = 1 + (a div BASE^m),
     a1 = a0*BASE^m-a = BASE^m-(a mod BASE^m).

   INPUT: a of an limbs, m the splitting point.
          a0 (if not NULL) needs room for max(an-m, 1)+1 limbs;
          a1 (if not NULL) needs room for m+1 limbs, and the a1 part reads
          a[0..m[, so it needs an >= m (or a zero-padded up to m limbs).
   SIDE-EFFECT: each of the two halves is computed only when its pointer is
          not NULL; *a0n and *a1n receive the lengths (a1 is normalized).

   REM: when an == m, a div BASE^m is 0, so a0 must be 1. This case is sent
        to the "0" branch (test an > m): copying an-m = 0 limbs would lead
        to a call to mpn_add_1 on an empty operand, which needs at least
        one limb.
 */
static void thull_yap_split(mp_ptr a0, mp_size_t *a0n,
			    mp_ptr a1, mp_size_t *a1n,
			    mp_ptr a, mp_size_t an, mp_size_t m)
{
    if(a0 != NULL){
	/* first compute a div B^m */
	if(an > m){
	    /* a0 = a[m..an[, at least one limb */
	    MPN_COPY(a0, a+m, an-m);
	    *a0n = an-m;
	}
	else{
	    /* a0 = a div B^m = 0 */
	    a0[0] = 0;
	    *a0n = 1;
	}
	/* add 1, growing by one limb on carry out */
	if(mpn_add_1(a0, a0, *a0n, 1) != 0){
	    a0[*a0n] = 1;
	    *a0n += 1;
	}
    }
    if(a1 != NULL){
	/* a1 <- B^m - a[0..m[: start from B^m = (0, ..., 0, 1) and let the
	   borrow of the m-limb subtraction eat the top limb */
	a1[m] = 1;
	if(m > 0){
	    MPN_ZERO(a1, m);
	    a1[m] -= mpn_sub_n(a1, a1, a, m);
	}
	/* m == 0 (not expected) leaves a1 = 1 without any mpn call on an
	   empty operand */
	*a1n = m+1;
	MPN_NORMALIZE(a1, *a1n);
    }
}
 
#if DEBUG_TY >= 1
/* Debug trace printed when entering a level: shows a and b, each line
   tagged with the current ty_level. */
static void thull_yap_enter(mp_ptr a, mp_size_t an, mp_ptr b, mp_size_t bn)
{
    const int level = ty_level;

    printf("-----> {%d} a:=", level);
    MPN_PRINT(a, an);
    printf(";\n{%d} b:=", level);
    MPN_PRINT(b, bn);
    printf(";\n");
}

/* Debug check: compare (ap, bp) with the reference values (as, bs), limb
   by limb.
   OUTPUT: 1 if both pairs are equal, 0 otherwise.
   SIDE-EFFECT: each mismatch is printed; if there is at least one, the
                quotient sequence R->lq is printed as well.
*/
static int thull_yap_check_as_bs(mp_ptr as, mp_size_t asn,
				 mp_ptr bs, mp_size_t bsn,
				 mp_ptr ap, mp_size_t apn,
				 mp_ptr bp, mp_size_t bpn, regular_t R)
{
    int mismatches = 0;

    /* a' vs a*: same length and same limbs? */
    if(!(apn == asn && mpn_cmp(ap, as, asn) == 0)){
	printf("check_error: a'=");
	MPN_PRINT(ap, apn);
	printf("\na*=");
	MPN_PRINT(as, asn);
	printf("\n");
	mismatches++;
    }
    /* b' vs b* */
    if(!(bpn == bsn && mpn_cmp(bp, bs, bsn) == 0)){
	printf("check_error: b'=");
	MPN_PRINT(bp, bpn);
	printf("\nand not\nb*=");
	MPN_PRINT(bs, bsn);
	printf("\n");
	mismatches++;
    }
    if(mismatches == 0)
	return 1;
    qseq_print(R->lq);
    printf(";\n");
    return 0;
}

/* mpz front-end of thull_yap_check_as_bs: passes the limbs and the
   mpz_size() of each operand (signs are therefore ignored). */
MAYBE_UNUSED
static int TY_check_as_bs(mpz_t as, mpz_t bs, mpz_t ap, mpz_t bp, regular_t R)
{
    const mp_size_t asn = mpz_size(as);
    const mp_size_t bsn = mpz_size(bs);
    const mp_size_t apn = mpz_size(ap);
    const mp_size_t bpn = mpz_size(bp);

    return thull_yap_check_as_bs(PTR(as), asn, PTR(bs), bsn,
				 PTR(ap), apn, PTR(bp), bpn, R);
}

#if DEBUG_TY >= 2
/** Debug check performed when leaving a level.
    We should have [a, b] = (M|Q) [a*, b*] with M == Q.

    INPUT: a, b the inputs of the level; (as, bs) the values computed by
           the level; R the accumulated quotients/matrix.
    SIDE-EFFECT: recomputes (a', b') from a, b and R->lq through
           mpz_qseq_apply_inverse and asserts that
           - b' >= 0,
           - (a', b') == (as, bs),
           - ||a'|| >= m > ||b'|| with m = thull_yap_bound(a, an).
           Prints a trace along the way.
    REM: when ||a|| < m, only the emptiness of R->lq is asserted.
         All the checks are inside assert(), so they vanish with NDEBUG.
*/
static void thull_yap_exit(mp_ptr a, mp_size_t an,
			   mp_ptr b, mp_size_t bn,
			   regular_t R,
			   mp_ptr as, mp_size_t asn,
			   mp_ptr bs, mp_size_t bsn,
			   int dolastmul, int level)
{
    mpz_t za, zb, zap, zbp;
    mp_size_t m = thull_yap_bound(a, an);

#if DEBUG_TY >= 1
    printf("lq:="); qseq_print(R->lq); printf(";\n");
#endif
    /* in fact, this is clearer, since we have computed m already */
    if(mpn_gcd_is_norm_lt(a, an, m) != 0){
	/* a is too small for anything to have been done */
	assert(qseq_is_empty(R->lq));
	return;
    }
    if(level || dolastmul){
	printf("Q from lq: %d\n", check_Q_from_lq(R->Q, R->lq));
    }
    /* za <- a, zb <- b as mpz, to use the mpz debug helpers */
    mpz_inits(zap, zbp, NULL);
    mpz_init_set_ui(za, 0);
    MPZ_SET_MPN(za, a, an);
    mpz_init_set_ui(zb, 0);
    MPZ_SET_MPN(zb, b, bn);
    
    gmp_printf("thull_yap_exit: za:=%Zd;\nzb:=%Zd;\n", za, zb);
    printf("{%d} pre_exit: as:=", ty_level); MPN_PRINT(as, asn);
    printf("; bs:="); MPN_PRINT(bs, bsn);
    printf(";\n");
    /* (zap, zbp) <- M^{-1} (a, b), recomputed from scratch */
    mpz_qseq_apply_inverse(zap, zbp, R->lq, za, zb);
    gmp_printf("thull_yap_exit: zap:=%Zd;\nzbp:=%Zd;\n", zap, zbp);
    assert(mpz_sgn(zbp) >= 0);
    assert(thull_yap_check_as_bs(as, asn, bs, bsn,
			      PTR(zap), mpz_size(zap),
			      PTR(zbp), mpz_size(zbp), R));
    /* ||a'|| >= m? */
    assert(mpn_gcd_is_norm_ge(PTR(zap), (mp_size_t)mpz_size(zap), m));
    /* ||b'|| < m? */
    assert(mpn_gcd_is_norm_lt(PTR(zbp), (mp_size_t)mpz_size(zbp), m));
    printf("<----- {%d} a:=", ty_level); MPN_PRINT(a, an);
    printf(";\nb:="); MPN_PRINT(b, bn);
    printf(";");
#if DEBUG_TY >= 2
    printf(" lq:="); qseq_print(R->lq); printf(";\n");
#else
    printf("\n");
#endif
    mpz_clears(za, zb, zap, zbp, NULL);
}
#endif /* DEBUG_TY >= 2 */
#endif /* DEBUG_TY >= 1 */

/** INPUT: ||a'|| >= m+t, we have no information on ||b'||. 
           tp has size >= 2*apn+3
    SIDE-EFFECT: ||a*|| >= m+t > ||b*||.
    ap and bp are swapped.

    TODO: share tp again?

    FIXME: can it stand as = ap and bs = bp?
*/

/* Run gcd_small on (ap, bp) with bound mpt, extending R.

   INPUT: apn >= bpn; bp must be writable at index bpn, and both ap and bp
          at index apn (length slots, see below);
          tp of size >= 2*apn+3
   PRECONDITION : R->Q == R->lq
   POSTCONDITION: newQ == newlq => really?
                  ||ap|| >= mpt > ||bp|| and at most imax quotients were
                  appended to R->lq (all three are asserted).
   SIDE-EFFECT: ap and bp are modified; their new lengths are read back from
                slot [apn] of each array after gcd_small (presumably written
                there by gcd_small) and stored again in the same slot.
   TODO: ap -> a, bp -> b, this is a primitive.

   FIXME: clean all these functions that end up calling gcd_small.

*/
static void thull_yap_advance(regular_t R,
			      mp_ptr ap, mp_size_t apn,
			      mp_ptr bp, mp_size_t bpn,
			      mp_size_t mpt, int imax,
			      mp_ptr tp, mp_size_t tp_alloc)
{
#if TIMINGS > 0
    double tt = runtime();
#endif
    const mp_size_t len_slot = apn; /* index holding the lengths */
    const int nq_before = R->lq->last;
    mp_size_t new_apn, new_bpn;

#if DEBUG_TY >= 200
    printf("Advancing by up to %d steps: mpt=%lu; floor(||b'||)=%lu\n",
	   imax, mpt, mpn_gcd_floor_norm(bp, bpn));
    printf("entering thull_yap_advance: Q[%d..%d[\n",
	   R->lq->first, R->lq->last);
    printf("----- checking M\n");
    check_Q_from_lq(R->Q, R->lq);
#endif
    assert(apn >= bpn);
    bp[bpn] = 0;
    gcd_small(R, ap, apn, bp, bpn, mpt, NULL, 0, tp, tp_alloc, 0);
#if DEBUG_TY >= 1
    printf("after gcd_small: R=="); regular_print(R); printf(";\n");
#endif
    new_apn = ap[len_slot];
    new_bpn = bp[len_slot];
#if DEBUG_TY >= 1
    printf("advance2: [%lu]ap:=", new_apn); MPN_PRINT(ap, new_apn);
    printf(";\n[%lu]bp:=", new_bpn); MPN_PRINT(bp, new_bpn);
    printf(";\n");
#endif
    /* not more iterations than predicted */
    assert(R->lq->last - nq_before <= imax);
    /* check bounds */
    assert(mpn_gcd_is_norm_ge(ap, new_apn, mpt));
    assert(mpn_gcd_is_norm_lt(bp, new_bpn, mpt));
    ap[len_slot] = new_apn;
    bp[len_slot] = new_bpn;
#if TIMINGS > 0
    fprintf(stderr, "{%d} advance: %lf\n", ty_level, runtime()-tt);
#endif
}

#define NEW_A_v 1 /* means we allocate A and v with the same size */

#if 0
/* Disabled helper: copy the four entries of M and the two components
   (a1, b1) into freshly allocated, zero-padded buffers of uniform length.

   TODO: change this M to something for-able 
   SIDE-EFFECT: create A[0..4[ with the same length, and v[0..2[ with the
                same even length, to be able to halve it.
                *nv <- max(a1n, b1n) rounded up to an even number;
                *nA <- largest mpz_size() among the entries of M;
                each A[i] is allocated with *nA+*nv+1 limbs.
                All six buffers are malloc'ed here; freeing them would be up
                to the caller.
   NOTE(review): written against the mpz-based mat2x2_t (fields a, b, c, d);
                its only caller is itself in a disabled branch.
*/
MAYBE_UNUSED
static void feed_A_v(mp_ptr *A, mp_size_t *nA,
		     mp_ptr *v, mp_size_t *nv, mat2x2_t M,
		     mp_ptr a1, mp_size_t a1n, mp_ptr b1, mp_size_t b1n)
{
    mp_size_t len;
    int i;

    /* make nv and nA even, so that we can split everybody easily */
    *nv = max(a1n, b1n);
    if(((*nv) & 1) == 1)
	*nv += 1;
    for(i = 0; i < 2; i++)
	v[i] = (mp_ptr)malloc((*nv) * sizeof(mp_limb_t));
    /* v[0] <- a1, v[1] <- b1, zero-padded to *nv limbs */
    MPN_COPY(v[0], a1, a1n);
    MPN_ZERO(v[0]+a1n, *nv-a1n);
    MPN_COPY(v[1], b1, b1n);
    MPN_ZERO(v[1]+b1n, *nv-b1n);
    /* we create enough space in A[i] */
    *nA = mpz_size(M->a);
    *nA = max(*nA, (mp_size_t)mpz_size(M->b));
    *nA = max(*nA, (mp_size_t)mpz_size(M->c));
    *nA = max(*nA, (mp_size_t)mpz_size(M->d));
    len = *nA+(*nv)+1;
    for(i = 0; i < 4; i++)
	A[i] = (mp_ptr)malloc(len * sizeof(mp_limb_t));
    /* A[0..4[ <- (M->a, M->b, M->c, M->d), zero-padded to len limbs */
    MPN_COPY(A[0], PTR(M->a), mpz_size(M->a));
    MPN_ZERO(A[0]+mpz_size(M->a), len-mpz_size(M->a));
    MPN_COPY(A[1], PTR(M->b), mpz_size(M->b));
    MPN_ZERO(A[1]+mpz_size(M->b), len-mpz_size(M->b));
    MPN_COPY(A[2], PTR(M->c), mpz_size(M->c));
    MPN_ZERO(A[2]+mpz_size(M->c), len-mpz_size(M->c));
    MPN_COPY(A[3], PTR(M->d), mpz_size(M->d));
    MPN_ZERO(A[3]+mpz_size(M->d), len-mpz_size(M->d));
}
#endif

/* INPUT: as of length *p_asn and w0 of length w0n, both normalized (the
          comparison below relies on the lengths);
          bs of length *p_bsn (0 meaning bs == 0) and w1 of length w1n;
          bs needs room for max(*p_bsn, w1n)+1 limbs, as for w0n limbs.
   OUTPUT: sign of as-w0: -1 if w0 > as, 1 otherwise.
   SIDE-EFFECT: as <- |as-w0|; bs += w1; *p_asn and *p_bsn are updated.
                w0 is clobbered when w0 > as (the difference is computed
                in place in w0 and then copied to as).
   REM: the borrow propagations are done outside of assert(), so that they
        are still performed when the code is compiled with NDEBUG.
*/
static int sub_add(mp_ptr as, mp_size_t *p_asn,
		   mp_ptr w0, mp_size_t  w0n,
		   mp_ptr bs, mp_size_t *p_bsn,
		   mp_ptr w1, mp_size_t  w1n)
{
    mp_size_t asn = *p_asn, bsn = *p_bsn;
    mp_limb_t borrow = 0;
    int sg = 1;
    
    if(w0n > asn || (w0n == asn && mpn_cmp(w0, as, w0n) > 0)){
	/* w0 > as: as <- w0-as */
	if(asn != 0){
	    borrow = mpn_sub_n(w0, w0, as, asn);
	    if(borrow != 0){
		/* w0 > as, so the borrow dies in the high limbs of w0 */
		borrow = mpn_sub_1(w0+asn, w0+asn, w0n-asn, 1);
		assert(borrow == 0);
	    }
	    MPN_NORMALIZE(w0, w0n);
	}
	MPN_COPY(as, w0, w0n);
	asn = w0n;
	sg = -1;
    }
    else{
	/* as >= w0: as -= w0 */
	if(w0n != 0){
	    borrow = mpn_sub_n(as, as, w0, w0n);
	    if(borrow != 0){
		/* as >= w0, so the borrow dies in the high limbs of as */
		borrow = mpn_sub_1(as+w0n, as+w0n, asn-w0n, 1);
		assert(borrow == 0);
	    }
	    MPN_NORMALIZE(as, asn);
	}
	sg = 1;
    }
    (void)borrow; /* only inspected by assert() */
    /* bs += w1 */
#if DEBUG_TY >= 2
    printf("bs:="); MPN_PRINT(bs, bsn); printf(";\n");
    printf("w1:="); MPN_PRINT(w1, w1n); printf(";\n");
#endif
    if(bsn == 0){
	/* bs == 0: the sum is w1 */
	MPN_COPY(bs, w1, w1n);
	bsn = w1n;
    }
    else{
	if(bsn >= w1n){
	    if(w1n != 0){
		if(mpn_add_n(bs, bs, w1, w1n) == 1){
		    /* we need to propagate carry: bs[w1n..[ += 1 */
#if DEBUG_TY >= 2
		    printf("bs+w1="); MPN_PRINT(bs, w1n); printf(";\n");
#endif
		    if(bsn == w1n)
			/* special case: no high part, bs grows by a limb */
			bs[bsn++] = 1;
		    else{
			if(mpn_add_1(bs+w1n, bs+w1n, bsn-w1n, 1) == 1){
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
			    printf("WARNING improbable carry for bs\n");
#endif
			    bs[bsn++] = 1;
			}
		    }
		}
	    }
	}
	else{ /* bsn < w1n: zero-extend bs up to w1n limbs first */
	    MPN_ZERO(bs+bsn, w1n-bsn);
	    bsn = w1n;
	    if(mpn_add_n(bs, bs, w1, w1n) == 1){
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		printf("WARNING improbable carry2 for bs\n");
#endif
		bs[bsn++] = 1;
	    }
#if DEBUG_TY >= 1
	    printf("bs:="); MPN_PRINT(bs, bsn); printf(";\n");
#endif
	}
    }
    *p_asn = asn;
    *p_bsn = bsn;
    return sg;
}

/* INPUT: a, b > 0; a0p and b0p should have a "large enough size" (!)
          at least size(a0p) >= an+1 and size(b0p) >= an+1, since the
          lengths are stored in slot [an];
	  bn >= m (asserted) and t != 0 (asserted) when R->lq is not empty;
	  tp needs at least m+1 limbs (it receives a1);
	  is_ab == 1 for the first call (hence original a and b),
	           0 for the second call
   PRECONDITION: R->Q == lq
   OUTPUT: sign(bs): 1, or -1 when the new bs is < 0 (its absolute value is
           stored). Always 1 when R->lq is empty or det(R) == 1.
   SIDE-EFFECT: (as, bs) <- Q^{-1} (a = a0*B^m-a1, b = b0*B^m+b1) given that
                (a0', b0') = Q^{-1} (a0, b0)
		as should be >= 0, but it may be that bs < 0
		as[an] <- asn; bs[an] <- bsn
		If R->lq is empty, (as, bs) is just a copy of (a, b).
   REM: we use as == a0p, bs == b0p, i.e., the result overwrites a0p, b0p.
   FIXME: we might have as = a, bs = b...
   REM: R is not modified.
   REM: the update itself (sub_add) is called outside of assert(): inside,
        it would not be executed with NDEBUG, leaving as, bs untouched and
        the returned sign uninitialized.
*/
static int thull_yap_extend(regular_t R,
			    mp_ptr a0p, mp_size_t a0pn,
			    mp_ptr b0p, mp_size_t b0pn,
			    mp_ptr a, mp_size_t an,
			    mp_ptr b, mp_size_t bn,
			    mp_size_t m, size_t t, int is_ab,
			    mp_ptr tp)
{
#if TIMINGS >= 1
    double tt;
#endif
    int delta = regular_determinant(R);
    mp_ptr a1, b1, w[2];
    mp_ptr as = a0p, bs = b0p;
    mp_size_t a1n = 0, b1n, asn, bsn, wn[2];
    int i, sbs;

    if(qseq_is_empty(R->lq)){
	/* no quotient: Q is the identity */
	MPN_COPY(as, a, an);
	MPN_COPY(bs, b, bn);
	as[an] = an;
	bs[an] = bn;
	return 1;
    }
    /* FIXME: pb with t == 0? */
    assert(t != 0);
#if DEBUG_TY >= 1
    printf("thull_yap_extend1\n");
# if DEBUG_TY >= 200
    check_Q_from_lq(R->Q, R->lq);
# endif
    printf("--> thull_yap_extend\n");
    printf("a0p:="); MPN_PRINT(a0p, a0pn); printf(";\n");
    printf("b0p:="); MPN_PRINT(b0p, b0pn); printf(";\n");
    printf("lq:="); qseq_print(R->lq); printf(";\n");
#endif    
#if TIMINGS >= 1
    tt = runtime();
#endif
    a1 = tp;
    /* a0 <- 1 + (a div BASE^m), a1 = a0*BASE^m-a; only a1 is needed here */
    thull_yap_split(NULL, NULL, a1, &a1n, a, an, m);
    /* b1 <- b mod BASE^m: the low limbs of b, used in place */
    assert(bn >= m);
    b1 = b;
    b1n = m;
    MPN_NORMALIZE(b1, b1n);
#if DEBUG_TY >= 1
    fprintf(stderr, "{%d} thull_yap_extend: a1n=%lu b1n=%lu\n",
	    ty_level, a1n, b1n);
#endif
#if DEBUG_TY >= 2
    printf("new_extend1: a:="); MPN_PRINT(a, an);
    printf(";\nb:="); MPN_PRINT(b, bn);
    printf(";\n");
    printf("new_extend1: [%lu]a1:=", a1n); MPN_PRINT(a1, a1n);
    printf(";\n[%lu]b1:=", b1n); MPN_PRINT(b1, b1n);
    printf(";\n");
#endif
    /* w <- [[s, q], [r, p]] * (a1, b1); w[0], w[1] are allocated there */
    thull_yap_compute_special_product(w, wn, R, a1, a1n, b1, b1n, is_ab);
    /* build ap and bp */
    /* FIXME: perhaps we don't need two different sets of variables? */
    /* as <- a0p*B^m, bs <- b0p*B^m;
       using memmove enables overlaps such as when as = a0p */
    memmove(as+m, a0p, a0pn * sizeof(mp_limb_t));
    MPN_ZERO(as, m);
    asn = m+a0pn;
    MPN_NORMALIZE(as, asn);
    memmove(bs+m, b0p, b0pn * sizeof(mp_limb_t));
    MPN_ZERO(bs, m);
    bsn = m+b0pn;
    MPN_NORMALIZE(bs, bsn);
    /* (as; bs) -= delta * (w[0]; -w[1]) 
       delta =  1: as -= w[0]; bs += w[1]; sub_add(as, w[0], bs, w[1]);
       delta = -1: bs -= w[1]; as += w[0]; sub_add(bs, w[1], as, w[0]);
     */
    if(delta == 1){
	assert(asn >= wn[0]);
	/* as -= w0 must stay >= 0 */
	sbs = sub_add(as, &asn, w[0], wn[0], bs, &bsn, w[1], wn[1]);
	assert(sbs == 1);
    }
    else{
	/* bs -= w1 may be < 0: sbs is its sign, bs its absolute value */
	sbs = sub_add(bs, &bsn, w[1], wn[1], as, &asn, w[0], wn[0]);
    }
#if DEBUG_TY >= 1
    if(sbs == -1)
	printf("new bs is < 0\n");
#endif
    /* the lengths go to slot [an], so the values must fit below it */
    assert(asn <= an);
    as[an] = asn;
    assert(bsn <= an);
    bs[an] = bsn;
    for(i = 0; i < 2; i++)
	free(w[i]);
#if DEBUG_TY >= 1
    printf("ext_ap:="); MPN_PRINT(as, asn); printf(";\n");
    printf("ext_bp:="); if(sbs == -1) printf("-");
    MPN_PRINT(bs, bsn); printf(";\n");
#endif
    return sbs;
}

/** Case t == 0 of the fixup: copy (a, b) into (as, bs) and let
    thull_yap_advance do all the work.

    INPUT: as, bs with room for an+1 limbs each (slot [an] holds a length).
    PRECONDITION : R->Q == R->lq, an >= bn >= m and we might be quite far.
    POSTCONDITION: ||a*|| >= m > ||b*||; R->Q == R->lq.
    SIDE-EFFECT: (as, bs) <- Ms * (a, b); as[an] and bs[an] contain the lengths
    REM: this function is called when parameters an and bn are small.
         as == a and/or bs == b is allowed (the copy is then skipped).
         The number of steps allowed is TY_LG_BASE * (an-m).
    FIXME: why stick at that and/or simplify w.r.t. advance? Tests should be
           reserved to advance?
*/
static void thull_yap_fixup_0(mp_ptr as, mp_ptr bs,
			      regular_t R,
			      mp_ptr a, mp_size_t an,
			      mp_ptr b, mp_size_t bn,
			      mp_size_t m, mp_ptr tp, mp_size_t tp_alloc)
{
#if TIMINGS > 0
    double tt = runtime();
#endif
    /* quite a large upper bound? */
    const int max_steps = TY_LG_BASE * (an-m);
    
#if DEBUG_TY >= 200
    printf("fixup_0: m=%lu Norm(a)=%lu Norm(b)=%lu\n", m,
	   mpn_gcd_floor_norm(a, an), mpn_gcd_floor_norm(b, bn));
    check_Q_from_lq(R->Q, R->lq);
#endif
    /* as <- a, with the extra slot cleared */
    if(as != a){
	MPN_COPY(as, a, an);
    }
    as[an] = 0;
    /* bs <- b, zero-padded up to and including slot [an] */
    if(bs != b){
	MPN_COPY(bs, b, bn);
    }
    MPN_ZERO(bs+bn, an-bn+1);
    thull_yap_advance(R, as, an, bs, bn, m, max_steps, tp, tp_alloc);
#if DEBUG_TY >= 200
    printf("fixup_0\n");
    check_Q_from_lq(R->Q, R->lq);
#endif
#if TIMINGS > 0
    fprintf(stderr,"{%d} fixup_0[advance]: %lu -> %lu %lf\n",
	    ty_level, an, as[an], runtime()-tt);
#endif
}

/* Undo the last Euclidean step.

   INPUT: Q = <q1, ..., qk>, not empty (asserted);
          a, b with their lengths *an, *bn; tp scratch of tp_alloc limbs.
   PRECONDITION: ?
   (a, b) <- (qk * a + b, a)
   (M, Q) <- (M / <qk>, Q minus <qk>)
   POSTCONDITION: ?
   REM: everything is delegated to euclid_step_back; the formulas above
        describe the intended effect (they are those of the former inline
        code, based on mpn_qseq_addmul_last + regular_oslash, which has been
        removed together with its now unused local variable).

   FIXME: roll_back vs. back_up? vs. oslash?

*/
static void gcd_roll_back(regular_t R,
			  mp_ptr a, mp_size_t *an,
			  mp_ptr b, mp_size_t *bn,
			  mp_ptr tp, mp_size_t tp_alloc)
{
    assert(qseq_is_empty(R->lq) == 0);
#if DEBUG_TY >= 1
    printf("Q_roll_back:="); qseq_print(R->lq); printf(";\n");
    printf("a:="); MPN_PRINT(a, *an); printf(";\n");
    printf("b:="); MPN_PRINT(b, *bn); printf(";\n");
#endif
    euclid_step_back(R, a, an, b, bn, tp, tp_alloc);
#if DEBUG_TY >= 200
    printf("end of gcd_roll_back\n");
    check_Q_from_lq(R->Q, R->lq);
    qseq_print(R->lq); printf(";\n");
    printf("newa:="); MPN_PRINT(a, *an); printf(";\n");
    printf("newb:="); MPN_PRINT(b, *bn); printf(";\n");
#endif
}

/**
   INPUT: ||a0'|| >= t > ||b0'||.
   SIDE-EFFECT: (as, bs) = M^{-1} (a, b) with as[an] = asn, bs[an] = bsn
                and ||as|| >= m+t > ||bs||.
   REM: stands as == a0p and bs == b0p.

   TODO: extend should have been called before...!
   So that we should replace a0p and b0p by ap and bp and sbp = sign(bp)

   FIXME: can we have as = ap and bs = bp?

*/

/**
   INPUT: (ap, sbp*bp) = M^{-1} (a, b) with sbp = 1 or -1.
          size(tp) >= an+1
   POSTCONDITION: ||as|| >= m+t > ||bs||
   SIDE-EFFECT: as[an] = asn, bs[an] = bsn

   FIXME: can we have as = ap and bs = bp?

   We could be operating on ap, bp without as, bs.

*/

/**
   INPUT: (ap, bp) = M^{-1} (a; b) that might be corrected.
   PRECONDITION : M == Q
   POSTCONDITION: newM == newQ
   (ap; bp) <- newM^{-1} * (a; b)
   FIXME: (a; b) should not be modified?
   REM: suppress as and bs definitively.
   TODO: since (a, b) are not really used, we could be able
         to do ap = a, bp = b.
*/
static void thull_yap_fixup(regular_t R,
			    mp_ptr ap, mp_size_t apn,
			    mp_ptr bp, mp_size_t bpn, int sbp,
			    mp_ptr a, mp_size_t an,
			    mp_ptr b, mp_size_t bn,
			    mp_size_t m, mp_size_t t,
			    mp_ptr tp, mp_size_t tp_alloc)
{
#if TIMINGS >= 2
    double tt;
#endif
    mp_ptr tmp1 = NULL, tmp2 = NULL, tmp3 = NULL;
#if 0
    mp_ptr as = ap, bs = bp; /* TMP!!! */
#endif
    mp_size_t asn, bsn, tmp2n, tmp3n;
    int detM = regular_determinant(R), det;

#if DEBUG_TY >= 1
    printf("Entering Fixup with m=%lu t=%lu R==", m, t);
    regular_print(R); printf(";\n");
    printf("--> fixup:\na:="); MPN_PRINT(a, an);
    printf(";\nb:="); MPN_PRINT(b, bn);
    printf(";\nm=%lu t=%lu det=%d\n", m, t, detM);
    printf("floor(||a||)=%lu floor(||b||)=%lu\n",
	   mpn_gcd_floor_norm(a, an), mpn_gcd_floor_norm(b, bn));
    printf("Fixup:\nap:="); MPN_PRINT(ap, apn);
    printf(";\nbp:="); if(sbp == -1) printf("-"); MPN_PRINT(bp, bpn);
    printf(";\n");
#endif
    if(t == 0){
#if DEBUG_TY >= 200
	printf("fixup_0 from fixup\n");
	check_Q_from_lq(R->Q, R->lq);
#endif
	/* in that case, Q = <>? */
#if DEBUG_TY >= 1
	printf("R_{t=0}:="); regular_print(R); printf(";\n");
#endif
	assert(qseq_is_empty(R->lq));
	/* stands ap = a, bp = b */
	thull_yap_fixup_0(ap, bp, R, a, an, b, bn, m, tp, tp_alloc);
#if DEBUG_TY >= 200
	check_Q_from_lq(R->Q, R->lq);
#endif
	return;
    }
    /* bp might be used in (+A) and ap in (+B):
       in both places, we could need some extra space 
       TODO: does this mean we don't need to zero these every time?
             And gain some cycles?
    */
    if(bpn < apn)
	MPN_ZERO(bp+bpn, apn-bpn);
    else
	MPN_ZERO(ap+apn, bpn-apn);
#if TIMINGS >= 2
    tt = runtime();
#endif
    if(detM == -1){
	/* leaf0 */
	// (-A) if b' >= 0 then M* = M
	if(sbp >= 0){
	    /* leaf00 */
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
	    printf("WARNING fixup: (-A) => return Q\n");
#endif
	    ap[an] = apn;
	    bp[an] = bpn;
	}
	else{
	    /* leaf01 */
	    /* b' < 0 */
	    // (-B) elif ||a'+b'|| >= m+t then M* = toggle(M)
	    // FIXME: are we sure that a'+b' >= 0? yes if (-B)
	    // b0p w.r.t. w???
	    /* tmp1 <- a'+b' */
	    assert(apn >= bpn);
	    /* FIXME: can we reorganize this to save copies? */
	    tmp1 = (mp_ptr)malloc((apn+1) * sizeof(mp_limb_t));
	    assert(mpn_sub(tmp1, ap, apn, bp, bpn) == 0);
	    asn = apn;
	    MPN_NORMALIZE(tmp1, asn);
#if DEBUG_TY >= 1
	    printf("newas:="); MPN_PRINT(tmp1, asn); printf(";\n");
	    printf("Norm(a'+b') >= m+t=%lu? %d\n",
		   m+t, mpn_gcd_is_norm_ge(tmp1, asn, m+t));
#endif
	    if(mpn_gcd_is_norm_ge(tmp1, asn, m+t) != 0){
		/* leaf010: ||as|| >= m+t */
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		printf("WARNING fixup: (-B) => Q* = toggle(Q)\n");
#endif
		/* M = <q1, ..., qk>; tmp2 <- qk */
		tmp2 = mpn_qseq_get_last(&tmp2n, R->lq);
#if DEBUG_TY >= 2
		printf("q_%d:=", (int)R->lq->last-1); MPN_PRINT(tmp2, tmp2n);
		printf(";\n");
#endif
		/* TODO: where are the values to be updated? */
		regular_oslash(R, tp, tp_alloc);
		if(tmp2n > 1 || tmp2[0] > 1){
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		    printf("WARNING fixup: leaf0100 = (-B) + q > 1\n");
#endif
		    /* qk > 1: M* = <q1, ..., q_{k-1}, q_k-1, 1>
		       a'' = qk*a'+b' = (qk-1)*a'+(a'+b') 
		       a'  = 1*(a'+b')-b'
		       a*  = a'+b', b* = -b'
		     */
		    mpn_sub_1(tmp2, tmp2, tmp2n, 1);
		    MPN_NORMALIZE(tmp2, tmp2n);
		    /* bs <- abs(bp) */
		    MPN_COPY(ap, tmp1, asn);
		    ap[an] = asn; /* FIXME: factor all this? */
		    bp[an] = bpn;
		    /* TODO: add the iterates? */
		    regular_omul_q(R, tmp2, tmp2n, tp, tp_alloc);
		    tmp2[0] = 1;
		    regular_omul_q(R, tmp2, 1, tp, tp_alloc);
		}
		else{
		    /* leaf0101 */
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		    printf("WARNING fixup: leaf0101 = (-B) + q = 1\n");
#endif
		    /* qk = 1: M* = <q1, ..., q_{k-2}, q_{k-1}+1> 
		       a'' = 1*a' + b'
		       a''' = q_{k-1}*a''+a' = (q_{k-1}+1)*a''+a'-a''
		            = (q_{k-1}+1)*a''-b'
		     */
		    tmp3 = mpn_qseq_get_last(&tmp3n, R->lq);
#if DEBUG_TY >= 1
		    printf("last is "); MPN_PRINT(tmp3, tmp3n); printf(";\n");
#endif
		    if(mpn_add_1(tmp3, tmp3, tmp3n, 1) == 1){
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
			/* TODO? */
			printf("WARNING fixup: leaf01010 = (-B)+carry!\n");
#endif
			tmp3 = realloc(tmp3, (tmp3n+1)*sizeof(mp_limb_t));
			tmp3[tmp3n++] = 1;
		    }
		    /* bs <- abs(bp); FIXME: factor? */
		    MPN_COPY(ap, tmp1, asn);
		    ap[an] = asn; /* FIXME: factor all this? */
		    bp[an] = bpn;
		    regular_oslash(R, tp, tp_alloc);
		    regular_omul_q(R, tmp3, tmp3n, tp, tp_alloc);
		    free(tmp3);
		}
		free(tmp2);
	    } /* conditionI */
	    else{
		/* leaf011: ||as|| < m+t  */
		/* tmp1 already contains ap+bp, and morally bs = -bp */
		tmp2 = mpn_qseq_get_last(&tmp2n, R->lq);
		regular_oslash(R, tp, tp_alloc);
		/* tmp2 >= 1 */
		if(tmp2n > 1 || tmp2[0] >= 2){
		    /* leaf0110 */
		    /* qk >= 2 then M* = <q1,..., q_{k-1}, q_k-1> 
		       a'' = qk*a'+b' = (qk-1)*a'+(a'+b')
		     */
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		    printf("WARNING fixup: (-C)\n");
#endif
		    mpn_sub_1(tmp2, tmp2, tmp2n, 1);
		    MPN_NORMALIZE(tmp2, tmp2n);
		    MPN_COPY(bp, tmp1, asn);
		    bp[an] = asn;
		    ap[an] = apn;
		    regular_omul_q(R, tmp2, tmp2n, tp, tp_alloc);
		}
		else{
		    /* leaf0111: tmp2 == 1 */
		    /* (-D) else M* backing up of M by two steps, but one
		       was already performed */
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		    printf("WARNING fixup: (-D) => backup\n");
#endif
		    /* remember bs = -bp */
		    /* FIXME: simplify this to save these copies? */
		    /* (bp, ap) <- (a', a'+b') */
		    MPN_COPY(bp, ap, apn); /* bp <- ap */
		    MPN_COPY(ap, tmp1, asn); /* ap <- tmp1=a'+b' */
		    /* at this point, as < bs */
#if DEBUG_TY >= 3
		    printf("[%lu]as:=",asn); MPN_PRINT(as, asn); printf(";\n");
		    printf("[%lu]bs:=",apn); MPN_PRINT(bs, apn); printf(";\n");
		    printf("[%lu]q:=", qseq_nl(R->lq, R->lq->last-1));
		    qseq_print_cell(R->lq, R->lq->last-1); printf(";\n");
#endif
		    bsn = apn;
		    /* (as, bs) = (bs + qk * as , as) */
		    gcd_roll_back(R, ap, &asn, bp, &bsn, tp, tp_alloc);
		    /* we should get ||as|| >= m+t > ||bs|| */
		    MPN_NORMALIZE(ap, asn);
		    MPN_NORMALIZE(bp, bsn);
		    ap[an] = asn;
		    bp[an] = bsn;
#if DEBUG_TY >= 3
		    printf("[%lu]Xas:=",asn); MPN_PRINT(ap,asn); printf(";\n");
		    printf("[%lu]Xbs:=",bsn); MPN_PRINT(bp,bsn); printf(";\n");
#endif
		}
		free(tmp2);
	    }
	    free(tmp1);
	}
    }
    else{ /* detM = 1 */
	/* this is a theoretical comparison of true norms, hence comparison */
#if 0
	printf("||a0'||=%lu >= t=%lu > ||b0'||=%lu; ||a'||=%lu, m+t=%lu\n",
	       mpn_ty_norm(a0p, a0pn), t, mpn_ty_norm(b0p, b0pn),
	       mpn_ty_norm(ap, apn), m+t);
#endif
	if((apn < bpn) || ((apn == bpn) && mpn_cmp(ap, bp, apn) <= 0)){
	    /* (+A) if ||a'|| <= ||b'|| then M* adv of <q1, ..., q_{k-1}>
	       by at most two steps
	       => roll a' and b' back (see proof...):
	       a'' = a'*qk+b' > a'.
	       Cannot happen for M = E
	       leaf10
	    */
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
	    printf("WARNING fixup: (+A) => roll back + adv\n");
#endif
	    /* FIXME: not clear for 2nd version */
	    /* (bp, ap) <- (ap + qk * bp, bp) */
	    gcd_roll_back(R, ap, &apn, bp, &bpn, tp, tp_alloc);
	    assert(bpn <= an+2); /* we were taking some extra space... */
	    bp[bpn] = 0;
	    /* bp = a'' */
	    ap[apn] = 0;
	    /* FIXME: * TY_LG_BASE ?; */
	    thull_yap_advance(R, ap, apn, bp, bpn, m+t, 2, tp, tp_alloc);
	    asn = ap[apn];
	    bsn = bp[apn];
	    ap[an] = asn;
	    bp[an] = bsn;
	}
	else if(mpn_gcd_is_norm_lt(ap, apn, m+t)){
	    /* (+B) elif ||b'|| < ||a'|| < m+t then 
	       M* is the backing up of M by one 
	       or two steps. Cannot happen for M = E. Really?
	       leaf11
	    */
	    assert(t != 0); /* FIXME: why again? */
	    if(qseq_is_empty(R->lq) != 0){
#if DEBUG_TY >= 1
		printf("fixup: ||a'|| < m+t and M empty\n");
#endif
		assert(mpn_gcd_is_norm_lt(bp, bp[bpn], m));
		assert(0);
	    }
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
	    printf("WARNING fixup: (+B) => at most 2 backing up\n");
#endif
	    /* FIXME: this is backing up, no? if yes, share! */
	    /* first: try a'' = a'*qk+b' */
	    mpn_qseq_addmul_last(bp, &bpn, ap, apn, R->lq);
	    regular_oslash(R, tp, tp_alloc);
	    assert(bpn <= an+2); /* we were taking some extra space... */
	    bp[bpn] = 0;
	    if(mpn_gcd_is_norm_ge(bp, bpn, m+t) != 0){
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		printf("WARNING fixup: leaf110 (+B) 1 step\n");
#endif
#if 0
		MPN_COPY(tp, bp, bpn);
		MPN_COPY(bp, ap, apn);
		MPN_COPY(ap, tp, bpn);
#else
		assert(bpn >= apn);
		MPN_SWAP(ap, bp, bpn);
		MPN_ZERO(bp+apn, bpn-apn);
#endif
		ap[an] = bpn;
		bp[an] = apn;
	    }
	    else{
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
		printf("WARNING fixup: leaf111 cannot exist\n");
#endif
		assert(0);
#if 0
		/* second: try a''' = a''*q_{k-1}+a' */
		mpn_qseq_addmul_last(ap, &apn, bp, bpn, Q);
		qseq_remove_last(Q);
		assert(apn <= an+2); /* we were taking some extra space... */
		ap[apn] = 0;
		assert(mpn_gcd_is_norm_ge(ap, apn, m+t) == 0);
		if(as != ap)
		    MPN_COPY(as, ap, apn);
		if(bs != bp)
		    MPN_COPY(bs, bp, bpn);
		as[an] = apn;
		bs[an] = bpn;
#endif
	    }
	}
	else{
	    /* (+C) else M* is the adv of M by at most two or four steps 
	       ||a'|| > ||b'|| and ||a'|| >= m+t
	    */
#if DEBUG_TY >= 1 || PRINT_WARNINGS >= 1
            printf("WARNING fixup: (+C) => adv by several steps\n");
#endif
#if DEBUG_TY >= 1
	    printf("lq[C+]:="); qseq_print(R->lq); printf(";\n");
#endif
#if 0
	    if(as != ap)
		MPN_COPY(as, ap, apn);
	    if(bs != bp)
		MPN_COPY(bs, bp, bpn);
#endif
	    thull_yap_advance(R, ap, apn, bp, bpn, m+t, 4*TY_LG_BASE,
			      tp, tp_alloc);
	    ap[an] = ap[apn];
	    bp[an] = bp[apn];
	}
    }
#if TIMINGS >= 2
    fprintf(stderr,"{%d} correcting: %lf\n", ty_level, runtime()-tt);
#endif
#if DEBUG_TY >= 200
    check_Q_from_lq(R->Q, R->lq);
#endif
}

/** One reduction stage: run the half-gcd on the high parts of (a, b),
    lift the result to the full operands and repair it with the fixup.

    INPUT: (a, an), (b, bn); a > b; size(a), size(b) >= an+1.
           ap, bp of size >= an+1; tp has size >= an+1
	   m is the limb offset of the split: a0 = a+m, b0 = b+m.
	   is_ab == 1 for the first call (hence original a and b),
	            0 for the second call; the argument is currently
		    overwritten with 0 on entry.
	   dolastmul and level are forwarded to thull_yap_hgcd (level+1).
    PRECONDITION: R->Q == R->lq = Id
    OUTPUT: *p_t receives t = thull_yap_bound(a0, a0n), or 0 when
            ||a0|| < t or ||a|| < m+t.
    SIDE-EFFECT: (ap; bp) = Q^{-1} (a; b) where (a0p; b0p) = Q^{-1}(a0; b0)
                 and ||a0'|| >= m + (*t) > ||b0'||.
		 ap[an], bp[an] contain the lengths of ap, bp.
		 a and b must keep their value (since a0 = a+m, etc.), but
		 they are written to: a0 is incremented in place for the
		 recursive call (a carry is stored in a[an]) and decremented
		 afterwards, and b0[b0n..a0n[ is cleared.
    POSTCONDITION: R->Q == R->lq
 */
static void thull_yap_reduce(mp_ptr ap, mp_ptr bp,
			     regular_t R,
			     mp_size_t *p_t,
			     mp_ptr a, mp_size_t an,
			     mp_ptr b, mp_size_t bn,
			     mp_size_t m, int is_ab,
			     mp_ptr tp, mp_size_t tp_alloc,
			     int dolastmul, int level)
{
    mp_ptr a0, b0;
    mp_size_t a0n, b0n, apn, bpn, t;
#if DEBUG_TY >= 1
    /* labels for the traces; computed from the caller's is_ab, before it
       is forced to 0 below */
    int step = 4-2*is_ab;
    char ch_a0 = (is_ab == 1 ? 'a' : 'c');
    char ch_b0 = (is_ab == 1 ? 'b' : 'd');
#endif
#if TIMINGS > 0
    double tt;
#endif
    int sbp; /* value returned by thull_yap_extend, handed to the fixup */

    is_ab = 0; /* always: since 222 does not seem to win! */
    
#if DEBUG_TY >= 100 /* doesn't work as is */
    printf("thull_yap_reduce: checking precondition\n");
    check_Q_from_lq(R->Q, R->lq);
#endif
#if TIMINGS > 0
    printf("{%d} reduce: an=%lu bn=%lu m=%lu\n", ty_level, an, bn, m);
#endif
    /* high parts: the limbs of a and b from index m upwards */
    a0 = a+m;
    b0 = b+m;
    a0n = an-m;
    b0n = bn-m;
    /* a0 <- a0 + 1, in place; a carry out extends a0 by one limb, which
       lands in a[an] (hence the size(a) >= an+1 requirement) */
    if(mpn_add_1(a0, a0, a0n, 1) != 0)
	a0[a0n++] = 1;
#if DEBUG_TY >= 3
    printf("{%d}{%d} %c0:=", ty_level, step, ch_a0);
    MPN_PRINT(a0, a0n); printf(";\n");
    printf("{%d}{%d} %c0:=", ty_level, step, ch_b0);
    MPN_PRINT(b0, b0n); printf(";\n");
#endif
    t = thull_yap_bound(a0, a0n);
    *p_t = t;
    if(mpn_gcd_is_norm_lt(a0, a0n, t) != 0 /* ||a0|| < H(a0) = t: caset0 */
       || mpn_gcd_is_norm_lt(a, an, m+t) != 0){ /* ||a|| < m+t: caset1 */
	/* FIXME: are we sure caset2 exists independently of caset1? */
#if DEBUG_TY >= 1
	printf("||a0|| < t=%d: %d ||a|| < m+t: %d\n", (int)t,
	       mpn_gcd_is_norm_lt(a0, a0n, t),
	       mpn_gcd_is_norm_lt(a, an, m+t));
#endif
	   /* report "no target" to the caller; the local t keeps the bound
	      until it is reloaded from *p_t after the recursive call */
	   *p_t = 0;
    }
#if DEBUG_TY >= 2
    printf("[%d] sizes: %c0: %lu %c0: %lu\n", step, ch_a0, a0n, ch_b0, b0n);
#endif
#if DEBUG_TY >= 1
    gmp_printf("{%d}{%d} t:=%lu;\n", ty_level, step, t);
#endif
    /* this is (as, bs) for (a0, b0) and not (a, b)!!! */
    /* pad b0 with zeros up to the length of a0, as thull_yap_hgcd
       expects; this writes into b */
    MPN_ZERO(b0+b0n, a0n-b0n);
    thull_yap_hgcd(ap, bp, R, a0, a0n, b0, b0n, dolastmul, level+1);
    /* for caset1: (ap, bp) = (a0, b0) and M = Id */
    if(*p_t != 0){
	/* at this point, ||ap|| >= t > ||bp|| */
	/* the recursive call left the lengths in ap[a0n], bp[a0n] */
	assert(mpn_gcd_is_norm_ge(ap, ap[a0n], (mp_limb_t)t));
	assert(mpn_gcd_is_norm_lt(bp, bp[a0n], (mp_limb_t)t));
    }
    else
	assert(hgcd_matrix_is_identity(R->Q)); // FIXME: why this?
    /* from here on t is 0 in the degenerate cases */
    t = *p_t;
#if DEBUG_TY >= 200 /* doesn't work as is */
    printf("thull_yap_reduce: checking after rec calls to hgcd\n");
    check_Q_from_lq(R->Q, R->lq);
#endif
    /* put a0 back to its original value, so that a is correct in fixup... */
    /* a0n still counts the carry limb, if any, so that limb is decremented
       back as well */
    mpn_sub_1(a0, a0, a0n, 1);
#if DEBUG_TY >= 1
    printf("{%d}{%d}: %c0s:=", ty_level, step, ch_a0); MPN_PRINT(ap, ap[a0n]);
    printf(";\n{%d}{%d}: %c0s:=", ty_level, step, ch_b0);
    MPN_PRINT(bp, bp[a0n]); printf(";\n");
#endif
    /* M of size BASE^(m/2) */
    /* also: [a', b'] <- M^{-1} [a, b] */
    apn = (mp_size_t)ap[a0n];
    bpn = (mp_size_t)bp[a0n];
#if TIMINGS > 0
    tt = runtime();
#endif
    /* at this point, R->Q == R->lq */
    /* TODO: simplify this call, since we only do that in-place */
    sbp = thull_yap_extend(R,ap,apn,bp,bpn,a,an,b,bn,m,t,is_ab,tp);
    /* if t == 0, (a, b) = (a', b') = (a_orig, b_orig) since M = E */
    /* the lengths are now read from index an (full operands), no longer
       from index a0n */
    bpn = bp[an];
    apn = ap[an];
    /* TODO: simplify this call, since we only do that in-place 
             -> reorganize args
     */
    /* if caset1 (M=E) force slow descent to m and get ||as|| >= m > ||bs|| 
       elif caset2: ||a|| < m+t
       else ordinary case: ||as|| >= m+t > ||bs||
     */
    thull_yap_fixup(R,ap,apn,bp,bpn,sbp,a,an,b,bn,m,t,tp,tp_alloc);
#if DEBUG_TY >= 1
    printf("after fixup: R:="); regular_print(R); printf(";\n");
#endif
#if TIMINGS > 0
    fprintf(stderr,"{%d} fixup_reduce: an=%lu bn=%lu nq=%d %lf\n",
	    ty_level, an, bn, qseq_card(R->lq), runtime()-tt);
#endif
    /* contract of this function: ||ap|| >= m+t > ||bp|| */
    assert(mpn_gcd_is_norm_ge(ap, (mp_size_t)ap[an], m + t));
    assert(mpn_gcd_is_norm_lt(bp, (mp_size_t)bp[an], m + t));
#if DEBUG_TY >= 100 /* doesn't work as is */
    printf("thull_yap_reduce: checking postcondition\n");
    check_Q_from_lq(R->Q, R->lq);
#endif
}

#define USE_AS_BS_AP_BP 1
#define ALLOCATE_c_d    1
#define ALLOCATE_cp_dp  0 /* cannot work as is */
#define USE_AS_BS_CP_DP 1

/* Abort with a diagnostic when an allocation made by thull_yap_hgcd failed;
   the function has no way to report an error to its callers. */
static void ty_hgcd_check_alloc(const void *p)
{
    if(p == NULL){
	fprintf(stderr, "thull_yap_hgcd: out of memory\n");
	abort();
    }
}

/** Half-gcd step of the Thull-Yap algorithm.

    INPUT: a > b >= 0; a and b of size 2*m;
	   a, b of size an+1; 
           an >= bn; b[bn..an[ is zero;
	   as, bs large enough of size >= an+1.
	   dolastmul: when level == 0 and dolastmul == 0, the final
	   product R <- R * R2 is skipped.
	   level: recursion depth (0 for the outermost call).
    OUTPUT: 0 if no reduction was performed (as == a and bs == b), 1 otherwise.
            0 is returned only by the early exit taken when ||a|| < m or
	    ||b|| < m, with m = thull_yap_bound(a, an).
    PRECONDITION: R->Q == R->lq = Id
    ACTION:
    Computes [a, b] = (M|Q) [a*, b*] where M == Q and
       (44) a < calA => M = E
     or
       (45) a >= calA => ||a*|| >= TY_bound(a) > ||b*||, a* > b* >= 0.
    We fill R[ind0..ind[
    REM: a and b should *not* be modified.
         It seems we cannot simplify more, since we cannot have c = as, d = bs
	 and at the same time ap = as, bp = bs.
	 => TODO: pass a buffer for (cp, dp)?
    SIDE-EFFECT: as[an], bs[an] will contain sizes;
    POSTCONDITION: R->Q == R->lq
    ERRORS: the program is aborted if a temporary buffer cannot be allocated.

 */
int thull_yap_hgcd(mp_ptr as, mp_ptr bs,
		   regular_t R,
		   mp_ptr a, mp_size_t an,
		   mp_ptr b, mp_size_t bn,
		   int dolastmul, int level)
{
    regular_t R2;
    mp_ptr ap, bp, c, d, cp, dp, tmp, mem;
    mp_size_t apn, bpn, cn, dn, cpn, dpn, tmp_alloc, tmp2n = 0;
    mp_size_t m, t, tp, l, k;
    int status = 1;
#if TIMINGS > 0
    double tot = runtime(), tt;
    ty_level++;
#endif

#if DEBUG_TY >= 1
    thull_yap_enter(a, an, b, bn);
#endif
    if(an > bn)
	assert(b[bn] == 0);
    m = thull_yap_bound(a, an);
    /* a cannot be "small": if a is not small1, then ||a|| >= bound(a) */
    if(mpn_gcd_is_norm_lt(a, an, m) || mpn_gcd_is_norm_lt(b, bn, m)){
        /* ||a|| < H(a) or ||b|| < H(a) */
#if DEBUG_TY >= 1
	printf("{%d} exiting: ||a|| < m: %d ||b|| < m: %d\n", ty_level,
	       mpn_gcd_is_norm_lt(a, an, m), mpn_gcd_is_norm_lt(b, bn, m));
#endif
	/* nothing to do: hand back copies of the inputs; nothing has been
	   allocated yet, hence the jump past every free() */
	MPN_COPY(as, a, an);
	as[an] = an;
	MPN_COPY(bs, b, bn);
	bs[an] = bn;
	status = 0;
	goto end_of_TY_end;
    }
    /* FIXME: overshooting, but for thull_yap_advance */
    tmp_alloc = (2*an+3);
    tmp = (mp_ptr)malloc(tmp_alloc * sizeof(mp_limb_t));
    ty_hgcd_check_alloc(tmp);
    /* at this point, ||a|| >= ||b|| >= m */
    if(thull_yap_is_small_2(a, an) != 0){
	/* we get to ||a*|| >= m > ||b*|| with quadratic functions */
#if DEBUG_TY >= 1
	printf("{%d} a is small2 => calling gcd_small\n", ty_level);
#endif
	MPN_COPY(as, a, an);
	MPN_COPY(bs, b, bn);
	/* clear bb[bn..an+1[ */
	MPN_ZERO(bs+bn, an-bn+1);
	gcd_small(R, as, an, bs, bn, m, NULL, 0, tmp, tmp_alloc, 0);
	/* FIXME: perhaps reorganize with the labels */
	/* tmp is the only buffer held here: release it by hand and skip
	   the cleanup labels */
	free(tmp);
	goto end_of_TY_end;
    }
#if DEBUG_TY >= 1
    printf("{%d}{1} a:=", ty_level); MPN_PRINT(a, an);
    printf(";\n{%d}{1} b:=", ty_level); MPN_PRINT(b, bn);
    printf(";\n{%d}{1} m:=%lu;\n", ty_level, m);
#endif
    /* [2] */
    /* a0, b0, b1 of size m; a1 size <= m+1 */
    /* FIXME: can we get rid from malloc's? And use ap = as, bp = bs? */
#if USE_AS_BS_AP_BP == 0
# if 0
    ap = (mp_ptr)malloc((an+1) * sizeof(mp_limb_t)); /* FIXME: overshooting! */
    bp = (mp_ptr)malloc((an+1) * sizeof(mp_limb_t)); /* FIXME: overshooting! */
# else
    /* FIXME: overshooting! */
    mem = (mp_ptr)malloc(((an+1) << 1) * sizeof(mp_limb_t));
    ty_hgcd_check_alloc(mem);
    ap = mem;
    bp = mem + (an+1);
# endif
#else
    /* the first reduction works directly in the output buffers */
    ap = as; bp = bs;
#endif
#if TIMINGS > 0
    tt = runtime();
#endif
    /* if we arrive here, R was left untouched */
    thull_yap_reduce(ap, bp, R, &t, a, an, b, bn, m, 1,
		     tmp, tmp_alloc, dolastmul, level);
#if DEBUG_TY >= 1
    printf("{%d} R1 has lq[%d..%d[ and n=%ld\n",
	   level, R->lq->first, R->lq->last, R->Q->n);
    check_Q_from_lq(R->Q, R->lq);
#endif
    apn = ap[an];
    bpn = bp[an];
#if TIMINGS > 0
    fprintf(stderr, "{%d} reduce1: m=%lu t=%lu (%lu, %lu) -> (%lu, %lu) %lf\n",
	    ty_level, m, t, an, bn, apn, bpn, runtime()-tt);
#endif
    /* ||a'|| >= m+t > ||b'||; m = an/2, t = an/4 => m+t = 3*an/4 */
    assert(mpn_gcd_is_norm_ge(ap, apn, m+t));
    assert(mpn_gcd_is_norm_lt(bp, bpn, m+t));
    /* a', b' of size (3*m/2) */
#if DEBUG_TY >= 2
    printf("[2] sizes: a': %lu, b': %lu\n", apn, bpn);
#endif
#if DEBUG_TY >= 1
    printf("{%d}end[2]: ap:=", ty_level); MPN_PRINT(ap, apn);
    printf(";\n{%d}end[2]: bp:=", ty_level); MPN_PRINT(bp, bpn);
    printf(";\n");
#endif
    /* [3] */
    if(mpn_gcd_is_norm_lt(bp, bpn, m) != 0){
	/* ||b'|| < m < m+t <= ||a'|| */
#if DEBUG_TY >= 1
	printf("{%d}{3} exits since ||b'|| < m=%lu\n", ty_level, m);
#endif
	if(ap != as)
	    MPN_COPY(as, ap, apn);
	as[an] = apn;
	if(bp != bs)
	    MPN_COPY(bs, bp, bpn);
	bs[an] = bpn;
	/* c and d do not exist yet: only tmp (and mem) must be released */
	goto end_of_TY1;
    }
    /* [4] is in theory
       (q, d) <- (ap div bp, ap mod bp)
       (c, d) <- (bp, ap mod bp)
       we have ap -> bp -> d in Euclid, ||a'|| >= m+t > ||b'|| */
    /* FIXME: could we use c == ap and d == bp??? or better c == bp, d == ap?*/
#if ALLOCATE_c_d == 1
    c   = (mp_ptr)malloc(an * sizeof(mp_limb_t)); /* FIXME: overshooting */
    d   = (mp_ptr)malloc(an * sizeof(mp_limb_t)); /* FIXME: overshooting */
    ty_hgcd_check_alloc(c);
    ty_hgcd_check_alloc(d);
#else
    c = bp; d = ap;
#endif
    MPN_ZERO(bp+bpn, apn-bpn);
#if TIMINGS
    tt = runtime();
#endif
    /* [4] to save memory is
       (tmp, d) <- (ap div bp, ap mod bp) 
       tmp should not be used; try c instead
    */
#if DEBUG_TY >= 100
    printf("before euclidean step\n");
    check_Q_from_lq(R->Q, R->lq);
#endif
    euclidean_step(R->lq, c, d, ap, apn, bp, bpn, tmp, tmp_alloc);
    /* euclidean_step did not update R; we assume R was updated with the
       first part of the computation, isn't it?
     */
    regular_mul_qi(R, R->lq->last-1, tmp, tmp_alloc);
#if DEBUG_TY >= 1
    printf("after euclidean step\n");
    check_Q_from_lq(R->Q, R->lq);
#endif
#if TIMINGS
    fprintf(stderr, "{%d} euclidean step: %lf\n", ty_level, runtime()-tt);
#endif
    /* finishing to perform: (c, d) <- (bp, ap mod bp) */
    /* this could lead to (c, d) == (bs, as) */
#if ALLOCATE_c_d == 1
    MPN_COPY(c, bp, bpn);
#endif
    cn = bpn;
    /* the remainder has at most bpn limbs; strip its leading zeros */
    dn = bpn;
    MPN_NORMALIZE(d, dn);
    /* c, d of size (3*m/2) */
#if DEBUG_TY >= 2
    printf("[4] sizes: c: %lu d: %lu\n", cn, dn);
#endif
#if DEBUG_TY >= 1
    printf("{%d}{4} c:=", ty_level); MPN_PRINT(c, cn);
    printf(";\n{%d}{4} d:=", ty_level); MPN_PRINT(d, dn);
    printf(";\n{%d}{4} m:=%lu;\n", ty_level, m);
#endif
    if(mpn_gcd_is_norm_lt(d, dn, m) != 0){
	/* not in the paper (etiq6) */
	/* ||d|| < m */
#if DEBUG_TY >= 1
	printf("etiq6: ||d|| < m\n");
	printf("etiq6: ||c|| >= m? %d\n", mpn_gcd_is_norm_ge(c, cn, m));
#endif
	assert(mpn_gcd_is_norm_ge(c, cn, m));
	/* beware if we have made some sort of (c, d) == (bs, as) */
	MPN_COPY(as, c, cn);
	as[an] = cn;
	MPN_COPY(bs, d, dn);
	bs[an] = dn;
	goto end_of_TY2;
    }
    thull_yap_split(tmp, &tmp2n, NULL, NULL, c, cn, m);
#if DEBUG_TY >= 1
    printf("{%d}{4} cc:=", ty_level); MPN_PRINT(tmp, tmp2n); printf(";\n");
#endif
    if(mpn_gcd_is_norm_lt(tmp, tmp2n, thull_yap_bound(tmp, tmp2n))){
	/* branch 2: ||c0 = tmp|| < H(c0), hence use jebelean/euclid */
#if DEBUG_TY >= 1
        printf("{%d}{4} cc is small1\n", ty_level);
#endif
	/* return R * Fixup(E, c, d, m, 0); */
	/* actually, this is a terminal append, to we can pass Q directly 
	   and this is valid for all qseq types
	*/
	/* FIXME: replace this with advance, since we don't care about c, d
	   that are temporary variables?
	 */
	/* FIXME: what if (c, d) == (bs, as)? */
	thull_yap_fixup_0(as, bs, R, c, cn, d, dn, m, tmp, tmp_alloc);
	/* move the sizes from index cn to index an, where callers expect
	   them */
	as[an] = as[cn];
	bs[an] = bs[cn];
	goto end_of_TY2;
    }
    /* [5] */
    /* l = ceiling(||c||) */
    l = mpn_gcd_ceiling_norm(c, cn);
    k = 2*m-l-1; /* ||c|| >= m+1 >= 4; claim: ||c||-1 >= k >= 0 */
#if DEBUG_TY >= 1
    printf("{%d}{5} l=%lu k=%lu\n", ty_level, l, k);
#endif
    assert((2*m) >= l+1); /* replaces k >= 0 */
    assert(mpn_gcd_is_norm_ge(c, cn, m+1) && m >= 3);
    assert(mpn_gcd_is_norm_ge(c, cn, k+1));
#if ALLOCATE_cp_dp == 1
# if 0
    cp = (mp_ptr)malloc(an * sizeof(mp_limb_t)); /* FIXME: overshooting */
    dp = (mp_ptr)malloc(an * sizeof(mp_limb_t)); /* FIXME: overshooting */
# else
    /* FIXME: overshooting */
    mem2 = (mp_ptr)malloc((an << 1) * sizeof(mp_limb_t));
    cp = mem2;
    dp = mem2 + an;
# endif
#else
# if USE_AS_BS_CP_DP == 0
    cp = ap; dp = bp; /* used as temp variables */
# else
    cp = as; dp = bs;
# endif
#endif
#if TIMINGS > 0
    tt = runtime();
#endif
    /* FIXME: can we have cp == c and dp == d? If yes, collapse a lot! */
    /* as if R were empty: the idea is to accumulate the 2nd part in Qa;
       and then multiply by Qb (see below) and finally update R. This means
       we are walking on the edge and cannot check R->Q == R->lq during this
       walk
     */
    regular_init(R2, an, an << 1, 1, 1);
    thull_yap_reduce(cp, dp, R2, &tp, c, cn, d, dn, k, 0,
		     tmp, tmp_alloc, dolastmul, level);
#if DEBUG_TY >= 1
    printf("{%d} R2 has lq[%d..%d[ and n=%ld\n",
	   level, R2->lq->first, R2->lq->last, R2->Q->n);
    check_Q_from_lq(R2->Q, R2->lq);
#endif
    /* the second reduction was called with length cn, so the sizes sit
       at index cn */
    cpn = cp[cn];
    dpn = dp[cn];
    /* k = m/2, t' = m/2 => k+t' = m, cn = dn = 3*m/2, cpn = dpn = m */
#if TIMINGS > 0
    fprintf(stderr, "{%d} reduce2: k=%lu t'=%lu (%lu, %lu) "
	    "-> (%lu, %lu) %lf\n",
	    ty_level, k, tp, cn, dn, cpn, dpn, runtime()-tt);
#endif
    /* claim: k+t' = m+1 */
    assert((k+tp) == (m+1));
    /* clear dp[dpn..cpn[ */
    MPN_ZERO(dp+dpn, cpn-dpn);
    /* [7] */
#if DEBUG_TY >= 1
    printf("{%d}{7} cp:=", ty_level); MPN_PRINT(cp, cpn);
    printf(";\n{%d}{7} dp:=", ty_level); MPN_PRINT(dp, dpn);
    printf(";\n");
#endif
    /* assert straddle ||c'|| >= k+t' > ||d'|| */
    assert(mpn_gcd_is_norm_ge(cp, cpn, k+tp));
    assert(mpn_gcd_is_norm_lt(dp, dpn, k+tp));
    /* could we have as = cp and bs = dp below? */
    /* terminal E, so we can use Q instead */
#if STATS
    printf("{%d} #M = %lu, #Qa.1 = %lu\n", level, R->Q->n, Qa->n);
#endif
    /* do we need to use R3 here? */
    thull_yap_fixup_0(as, bs, R2, cp, cpn, dp, dpn, m, tmp, tmp_alloc);
#if STATS
    printf("{%d} #R2->Q = %lu\n", level, R2->Q->n);
#endif
    as[an] = as[cpn];
    bs[an] = bs[cpn];
    /* S <- S*(T=E); return S; */
    /* perhaps not needed if this is the last operation */
    if(level > 0 || dolastmul){
	assert(dolastmul); // FIXME
#if TIMINGS >= 1
	double tt = runtime();
#endif
	regular_omul(R, R2);
#if TIMINGS >= 1
	printf("{%d} omul(%d, %d): %lf\n", ty_level, qseq_card(R->lq),
	       qseq_card(R2->lq), runtime()-tt);
#endif
    }
    regular_clear(R2);
#if DEBUG_TY >= 2
    printf("S*=E="); qseq_print(R->lq); printf(";\n");
#endif
#if ALLOCATE_cp_dp == 1
# if 0
    free(cp); free(dp);
# else
    free(mem2);
# endif
#endif
#if QSEQ_FULL == 0
    /* we have filled Q[Qlast_orig..Q->lq->last[ but we want to condense this
       back to Q[Qlast_orig..Qlast_orig+2[ with only the last two quotients
       to enable a putative backup later in the fixup above.
       FM's comment 2022/02/11: what a crazy idea! Causes a bug in
       recette with imin = 7
    */
# if 0
    printf("Qlast_orig=%d Q->lq->last=%d\n", Qlast_orig, R->lq->last);
    if(R->lq->last - Qlast_orig > 2){
	R->lq->tind[Qlast_orig]   = R->lq->tind[R->lq->last-2];
	R->lq->tind[Qlast_orig+1] = R->lq->tind[R->lq->last-1];
	R->lq->last = Qlast_orig+2;
    }
# endif
#endif
    /* cleanup ladder: each label releases what was acquired since the
       previous one */
 end_of_TY2:
#if ALLOCATE_c_d
    free(c); free(d);
#endif
 end_of_TY1:
    free(tmp);
#if USE_AS_BS_AP_BP == 0
# if 0
    free(ap); free(bp);
# else
    free(mem);
# endif
#endif
 end_of_TY_end:
#if DEBUG_TY >= 2
    thull_yap_exit(a,an,b,bn,R,as,as[an],bs,bs[an],dolastmul,level);
#endif
#if TIMINGS > 0
    fprintf(stderr, "{%d} TY: %lu %lu %lf\n", ty_level, an, bn, runtime()-tot);
    ty_level--;
    printf("{%d} TY_end: m=%lu Q[%d..%d[ %lf\n",
	   ty_level, m, R->lq->first, R->lq->last, runtime()-tot);
#endif
    return status;
}

/** Run one half-gcd on (u, v), then finish with gcd_small until the
    remainders straddle rmin.

    INPUT: u > v >= 0, un >= vn >= 1.
           tp of size >= 2*un+3
	   r of size >= un+1 (r[un] receives the length of r);
	   rprev is either NULL or of size >= un+1 (rprev[un] receives the
	   length of rprev).
    SIDE-EFFECT: rprev >= rmin > r.
    REM: u and v should not be modified.
    ERRORS: the program is aborted if a temporary buffer cannot be allocated.
*/
void thull_yap_use(mp_ptr rprev, mp_ptr r,
		   mp_ptr u, mp_size_t un, mp_ptr v, mp_size_t vn,
		   mp_ptr rmin, mp_size_t rminn,
		   mp_ptr tp, mp_size_t tp_alloc)
{
    regular_t R;
    mp_ptr as, bs, vp;
    mp_size_t asn, bsn;
    int dolastmul;
#if TIMINGS > 0
    double tt;
#endif
    
#if DEBUG_TY >= 1 || TIMINGS > 0
    ty_level = 0;
#endif
    regular_init(R, un, un << 1, 1, 1); /* TODO: really? */
#if DEBUG_TY >= 1
    printf("u:="); MPN_PRINT(u, un);
    printf(";\nv:="); MPN_PRINT(v, vn);
    printf(";\n");
#endif
    /* FIXME: could we save some of these? */
    as = (mp_ptr)malloc((un+1) * sizeof(mp_limb_t));
    bs = (mp_ptr)malloc((un+1) * sizeof(mp_limb_t));
    vp = (mp_ptr)malloc((un+1) * sizeof(mp_limb_t));
    if(as == NULL || bs == NULL || vp == NULL){
	/* no error channel towards the caller: stop here rather than
	   dereference a null pointer below */
	fprintf(stderr, "thull_yap_use: out of memory\n");
	abort();
    }
    /* ||as|| >= m > ||bs|| for some m s.t. m >= ||rmin|| */
#if TIMINGS > 0
    tt = runtime();
#endif
    /* always ask for the last product; the dolastmul == 0 branch below is
       kept for the variant where R->Q would be false w.r.t. R->lq */
    dolastmul = 1;
    thull_yap_hgcd(as, bs, R, u, un, v, vn, dolastmul, 0);
#if TIMINGS > 0
    fprintf(stderr, "ThYa: %lu %lu %lf\n", un, vn, runtime()-tt);
#endif
    /* thull_yap_hgcd left the lengths in as[un], bs[un] */
    MPN_COPY(vp, as, (mp_size_t)as[un]);
    MPN_COPY(r,  bs, (mp_size_t)bs[un]);
    asn = (mp_size_t)as[un];
    bsn = (mp_size_t)bs[un];
    /* pad r with zeros up to the length of vp before gcd_small */
    MPN_ZERO(r+bsn, asn-bsn);
#if DEBUG_TY >= 1
    printf("Back from thull_yap_hgcd\n");
    printf("  ri:="); MPN_PRINT(vp, asn);
    printf(";\nrmin:="); MPN_PRINT(rmin, rminn);
    printf(";\nrip1:="); MPN_PRINT(r, bsn);
    printf(";\n");
#endif
#if TIMINGS > 0
    tt = runtime();
#endif
    if(dolastmul)
	/* we accumulate in R */
	gcd_small(R, vp, asn, r, bsn, 0, rmin, rminn, tp, tp_alloc, 0);
    else{
	int Qfirst = R->lq->first;

	/* temporarily hide the quotients already stored in R->lq */
	R->lq->first = R->lq->last;
	/* we reuse R->Q since we do not care. FIXME: really?*/
	hgcd_matrix_set_identity(R->Q);
	gcd_small(R, vp, asn, r, bsn, 0, rmin, rminn, tp, tp_alloc, 0);
	R->lq->first = Qfirst;
    }
#if TIMINGS > 0
    fprintf(stderr, "TYGC: %lu %lu %lf\n", asn, bsn, runtime()-tt);
#endif
    /* the lengths after gcd_small are read from index asn (vp[asn],
       r[asn]); publish the length of r at index un for the caller */
    r[un] = r[asn];
#if DEBUG_TY >= 1
    printf("   ri:="); MPN_PRINT(vp, vp[asn]);
    printf(";\nrmin:="); MPN_PRINT(rmin, rminn);
    printf(";\nrip1:="); MPN_PRINT(r, r[un]);
    printf(";\n");
#endif
    /* as >= rmin > bs */
    assert(((mp_size_t)vp[asn] > rminn)
	   || ((mp_size_t)vp[asn] == rminn && mpn_cmp(vp, rmin, rminn) >= 0));
    assert((rminn > (mp_size_t)r[un])
	   || (rminn == (mp_size_t)r[un] && mpn_cmp(rmin, r, rminn) >= 0));
    if(rprev != NULL){
	MPN_COPY(rprev, vp, (mp_size_t)vp[asn]);
	rprev[un] = vp[asn];
    }
    free(as);
    free(bs);
    free(vp);
    regular_clear(R);
}

/* Gcd of (a, b): repeat thull_yap_hgcd until it reports no reduction, then
   finish with gcd_small.

   INPUT: a > b (hence an >= bn); a and b are only read.
          g large enough for the gcd.
          dolastmul is forwarded to every thull_yap_hgcd call.
   OUTPUT: gn.
   SIDE-EFFECT: g[0..gn[ <- gcd(a, b).
                R receives the first half-gcd directly; every later one is
		computed in a fresh R2 and folded in with regular_omul.
   POSTCONDITION: R->Q == R->lq
   ERRORS: the program is aborted if a work buffer cannot be allocated.
 */
mp_size_t thull_yap_gcd(regular_t R, mp_ptr g, mp_ptr a, mp_size_t an,
			mp_ptr b, mp_size_t bn, int dolastmul)
{
    regular_t R2;
    mp_ptr ri, rip1, u, v, tp, tmp;
    mp_limb_t rmin[1];
    mp_size_t un, vn, n = an, tp_alloc;
    int ok = 1, nsteps = 0;
#if TIMINGS > 0
    fprintf(stderr, "TYGCD.enter: %lu %lu (%d)\n", an, bn, dolastmul);
#endif
    /* FIXME: could not we use 2 buffers instead of 4? */
    /* n+1 limbs each: the extra limb holds either a guard zero or the
       length returned by thull_yap_hgcd */
    u     = (mp_ptr)malloc((n+1) * sizeof(mp_limb_t));
    v     = (mp_ptr)malloc((n+1) * sizeof(mp_limb_t));
    ri    = (mp_ptr)malloc((n+1) * sizeof(mp_limb_t));
    rip1  = (mp_ptr)malloc((n+1) * sizeof(mp_limb_t));
    tp_alloc = (2*n+3);
    tp    = (mp_ptr)malloc(tp_alloc * sizeof(mp_limb_t));
    if(u == NULL || v == NULL || ri == NULL || rip1 == NULL || tp == NULL){
	/* no error channel towards the caller: stop here rather than
	   dereference a null pointer below */
	fprintf(stderr, "thull_yap_gcd: out of memory\n");
	abort();
    }
    MPN_COPY(u, a, an);
    MPN_COPY(v, b, bn);
    un = an;
    vn = bn;
#if TIMINGS > 0
    double tt = runtime();
#endif
    while(1){
#if TIMINGS > 0
	fprintf(stderr, "TYGCD_%d: %lf\n", nsteps, runtime()-tt);
	tt = runtime();
#endif	
#if DEBUG_TY >= 1
	printf("u:="); MPN_PRINT(u, un); printf(";\n");
	printf("v:="); MPN_PRINT(v, vn); printf(";\n");
#endif
	u[un] = 0;
	v[vn] = 0;
	/* thull_yap_hgcd documents b[bn..an[ as zero, and the final
	   gcd_small below sees the same v; without this the limbs
	   v[vn+1..un] are leftovers (or uninitialised on the first pass
	   when bn < an) */
	if(un > vn)
	    MPN_ZERO(v + vn + 1, un - vn);
	if(nsteps == 0)
	    ok = thull_yap_hgcd(ri, rip1, R, u, un, v, vn, dolastmul, 0);
	else{
	    /* FIXME: a bit costly right now */
	    regular_init(R2, un, un << 1, 1, 1);
	    ok = thull_yap_hgcd(ri, rip1, R2, u, un, v, vn, dolastmul, 0);
	    regular_omul(R, R2);
	    regular_clear(R2);
	}
#if DEBUG_TY >= 1
	printf("check %d: last=%d\n", nsteps, R->lq->last);
	check_Q_from_lq(R->Q, R->lq);
#endif
	/* 0 means (ri, rip1) are plain copies of (u, v): no more progress */
	if(ok == 0)
	    break;
	/* the reduced pair becomes the next input; the old inputs become
	   the next output buffers */
	tmp = ri; ri = u; u = tmp;
	tmp = rip1; rip1 = v; v = tmp;
	/* both lengths sit at the old index un: read vn before un changes */
	vn = v[un];
	un = u[un];
	nsteps++;
    }
    /* we want u >= rmin > v, and typically u = gcd >= 1 > 0 */
    rmin[0] = 1;
    gcd_small(R, u, un, v, vn, 0, rmin, (mp_size_t)1, tp, tp_alloc, 0);
#if DEBUG_TY >= 1
    check_Q_from_lq(R->Q, R->lq);
#endif
    /* not needed: qseq_append(R, RR); */
    /* the length of the result is read back from u[un] */
    un = u[un];
    MPN_COPY(g, u, un);
    free(ri);
    free(rip1);
    free(u);
    free(v);
    free(tp);
    return un;
}
