/*
 * The contents of this file are subject to the MonetDB Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.monetdb.org/Legal/MonetDBLicense
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is the MonetDB Database System.
 *
 * The Initial Developer of the Original Code is CWI.
 * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
 * Copyright August 2008-2015 MonetDB B.V.
 * All Rights Reserved.
 */

/*
 * @f gdk_setop
 *
 */
/*
 * @a Peter Boncz
 *
 * @* Set Operations
 * Set operations are provided in two series:
 * @itemize
 * @item
 * k-@emph{operand}, which look only at the head column.
 * @item
 * s-@emph{operand} series, that look at the whole BUN.
 * @end itemize
 *
 * Operands provided are:
 * @itemize
 * @item kunion
 * produces a bat union.
 * @item kdiff
 * produces bat difference.
 * @item kintersect
 * produces bat intersection.
 * @end itemize
 *
 * Implementations typically take two forms: if the input relation(s)
 * is/are ordered, a merge-algorithm is used. Otherwise, hash-indices
 * are produced on demand for the hash-based versions.
 *
 * The @emph{kintersect(l,r)} operations result in all BUNs of
 * @emph{l} that are also in @emph{r}. They do not do
 * double-elimination over the @emph{l} BUNs.
 *
 * The @emph{kdiff(l,r)} operations result in all BUNs of @emph{l}
 * that are not in @emph{r}. They do not do double-elimination over
 * the @emph{l} BUNs.
 *
 * The @emph{kunion(l,r)} operations result in all BUNs of
 * @emph{l}, plus all BUNs of @emph{r} that are not in @emph{l}. They
 * do not do double-elimination over the @emph{l} nor @emph{r} BUNs.
 *
 * The @emph{kintersect(l,r)} is used also as implementation for the
 * @emph{semijoin()}.
 */

#include "monetdb_config.h"
#include "gdk.h"
#include "gdk_private.h"
#include "gdk_search.h"

#define HITintersect(h,t)       bunfastins(bn,h,t)
#define HITdiff(h,t)
#define MISSintersect(h,t)
#define MISSdiff(h,t)           bunfastins(bn,h,t)

#define HITintersect_nocheck(h,t)       bunfastins_nocheck(bn,BUNlast(bn),h,t,Hsize(bn),Tsize(bn))
#define HITdiff_nocheck(h,t)
#define MISSintersect_nocheck(h,t)
#define MISSdiff_nocheck(h,t)           bunfastins_nocheck(bn,BUNlast(bn),h,t,Hsize(bn),Tsize(bn))

#define DHITintersect(h,t)       bnh[o] = *(h); bnt[o++] = t;
#define DHITdiff(h,t)
#define DMISSintersect(h,t)
#define DMISSdiff(h,t)           bnh[o] = *(h); bnt[o++] = t;

#define ENDintersect(h,t)
#define ENDdiff(h,t)            for(;p1<q1;p1++) bunfastins(bn,h,t)

/*
 * @+ Double Elimination
 * Comes in two flavors: looking at one column, or at two at-a-time.
 * Implementation is either merge- or hash-based.
 */

/*
 * @+ Difference and Intersect
 * Difference and intersection are handled together: BATkdiff(l,r)
 * and BATkintersect(l,r) share the macro-generated code below.
 */
/*
 * mergecheck: walk l once and advance a second cursor through r,
 * classifying every BUN of l as a hit or a miss on its head value.
 *
 *  a1  operation name (intersect or diff), pasted behind HIT, MISS
 *      and END.
 *  a2  head accessor suffix, pasted behind BUNh (e.g. loc, var).
 *  a3  only used to build the label end<a2><a3> and in the debug
 *      message; it keeps the label unique when the macro is expanded
 *      several times in one function.
 *  a4  comparison expression written in terms of the local names h
 *      and h2.  It is expanded twice: once to compare h with the
 *      current head of r, and once after h2 has been replaced by hnil.
 *
 * While a4 is positive (presumably: h sorts after h2) the cursor in r
 * is advanced; when r is exhausted control jumps to the end label.
 * When a4 is zero, h2 is redirected to hnil and a4 is evaluated again:
 * a non-zero result (h is not nil) is a hit, otherwise the BUN falls
 * through to the miss action.  The cursor in r is not advanced after a
 * hit, so repeated heads in l hit the same BUN of r.
 *
 * Expects l, r, bn and hnil at the place of expansion.
 */
#define mergecheck(a1,a2,a3,a4)						\
	do {								\
		BUN p1 = BUNfirst(l), p2 = BUNfirst(r);			\
		BUN q1 = BUNlast(l),  q2 = BUNlast(r);			\
		BATiter li = bat_iterator(l);				\
		BATiter ri = bat_iterator(r);				\
									\
		ALGODEBUG fprintf(stderr,				\
				  "#BATins_%s%s: mergecheck[%s, %s, %s, %s, k];\n", \
				  #a1, #a2, #a1, #a2, #a3, #a4);	\
		if (p2 < q2)						\
			BATloop(l, p1, q1) {				\
				ptr  h = BUNh##a2(li, p1);		\
				ptr  t = BUNtail(li, p1);		\
				ptr h2 = BUNh##a2(ri, p2);		\
				int c;					\
				while ((c = a4) > 0) {			\
					if ((++p2) >= q2)		\
						goto end##a2##a3;	\
					h2 = BUNh##a2(ri, p2);		\
				}					\
				if (c == 0) {				\
					h2 = hnil;			\
					if (a4) { /* check for not-nil (nils don't match anyway) */ \
						HIT##a1(h, t);		\
						continue;		\
					}				\
				}					\
				MISS##a1(h, t);				\
			}						\
	  end##a2##a3:;							\
		END##a1(BUNh##a2(li, p1), BUNtail(li, p1));		\
	} while (0)

/*
 * hashcheck: for every BUN of l, probe the hash table on the head of r
 * and classify the BUN as a hit or a miss.
 *
 *  a1  operation name (intersect or diff), pasted behind HIT and MISS.
 *  a2  head accessor suffix for l, pasted behind BUNh (e.g. loc, var,
 *      pos).
 *  a3  only appears in the debug message.
 *  a4  suffix pasted behind HASHloop to select the probe loop.
 *  a5  expression guarding the probe; the callers pass either TRUE or
 *      a comparison of h with h2.  h2 is initialised to hnil here, so
 *      such a comparison tests h against nil.  When a5 is zero the
 *      probe is skipped and the BUN is treated as a miss.
 *
 * BATprepareHash(r) is called first; a non-zero result jumps to the
 * label bunins_failed, which the expanding function must provide.
 * Only the first match of the probe loop is used: the loop body ends
 * in a break.
 *
 * Expects l, r, bn and hnil at the place of expansion.
 */
#define hashcheck(a1,a2,a3,a4,a5)					\
	do {								\
		BUN p1, q1;						\
		int ins;						\
		BUN s2;							\
		ptr h, t, h2 = hnil;					\
		BATiter li = bat_iterator(l);				\
		BATiter ri = bat_iterator(r);				\
									\
		ALGODEBUG fprintf(stderr, "#BATins_%s%s: hashcheck[%s, %s, %s, %s, k];\n", #a1, #a2, #a1, #a2, #a3, #a4); \
		if (BATprepareHash(r)) {				\
			goto bunins_failed;				\
		}							\
		BATloop(l, p1, q1) {					\
			h = BUNh##a2(li, p1);				\
			t = BUNtail(li, p1);				\
			ins = TRUE;					\
			if (a5) /* check for not-nil (nils don't match anyway) */ \
				HASHloop##a4(ri, r->H->hash, s2, h) {	\
					HIT##a1(h, t);			\
					ins = FALSE;			\
					break;				\
				}					\
			if (!ins)					\
				continue;				\
			MISS##a1(h, t);					\
		}							\
		(void)h2; /* in some cases the a5 check doesn't use the h2 */ \
	} while (0)

/*
 * Number of slots in the small lookup table built by directcheck.
 * The check macro only selects directcheck when BATcount(r) is below
 * this value, and directcheck uses DIRECT_MAX-1 as hash mask.
 */
#define DIRECT_MAX 256

/*
 * Per-type equality tests on two pointers, all expressed through
 * simple_EQ (defined elsewhere).  One of these names is passed to
 * directcheck as its a6 argument.
 */
#define bte_EQ(x,y) simple_EQ(x,y,bte)
#define sht_EQ(x,y) simple_EQ(x,y,sht)
#define int_EQ(x,y) simple_EQ(x,y,int)
#define lng_EQ(x,y) simple_EQ(x,y,lng)
#define flt_EQ(x,y) simple_EQ(x,y,flt)
#define dbl_EQ(x,y) simple_EQ(x,y,dbl)

/*
 * directcheck: variant for a small r.  Instead of a real hash table a
 * local array d of DIRECT_MAX sht slots is filled from the heads of r,
 * after which l is scanned and every BUN is classified as hit or miss.
 *
 *  a1  operation name (intersect or diff), pasted behind HIT, MISS,
 *      DHIT and DMISS.
 *  a2  head accessor suffix used while scanning r (BUNh<a2>); also
 *      handed on to hashcheck.
 *  a3  only appears in debug messages and is handed on to hashcheck.
 *  a4  C type name of the head values; used for the pointer casts,
 *      pasted behind hash_ to select the hash function, and handed on
 *      to hashcheck as _<a4>.
 *  a5  not-nil expression in terms of h and h2 (h2 is hnil here).
 *  a6  equality macro taking two pointers (the callers pass one of the
 *      *_EQ macros).
 *
 * Build phase: slot d[hash(head)] receives the position of the BUN in
 * r plus one, so 0 means "empty".  If a slot is already taken, or a
 * head of r fails the a5 test, the table is abandoned and the work is
 * done by hashcheck instead.
 * NOTE(review): the position is cast to sht before being stored; this
 * looks safe only while positions in r fit in a sht -- confirm that
 * BUNfirst(r) cannot be large here.
 *
 * Scan phase, two flavours:
 *  - l has a void tail with a non-nil tseqbase: tail values are taken
 *    from a counter b that starts at l->tseqbase, and results are
 *    stored straight into the head and tail arrays of bn via the
 *    DHIT/DMISS macros.  bn is extended with BATextend when needed and
 *    its count is set with BATsetcount.  In this flavour h is based at
 *    BUNfirst(l) and p1 runs from 0 to BATcount(l).
 *  - otherwise: tails are fetched with BUNtail and BUNs are appended
 *    with the _nocheck action macros, after bn has been extended so
 *    that the inner loop stays within its capacity.  Here h is based
 *    at position 0 and p1 runs from BUNfirst(l) to BUNlast(l).
 * A slot match alone is not a hit: the head of l is also compared with
 * the head of r stored at that position using a6.
 *
 * A failing BATextend jumps to the label bunins_failed.  Expects l, r,
 * bn and hnil at the place of expansion.
 */
/* later add version for l void tail, remove general tail values then */
#define directcheck(a1,a2,a3,a4,a5,a6)					\
	do {								\
		BUN p1, q1;						\
		int i;							\
		ptr h, h2 = hnil;					\
		BATiter li = bat_iterator(l);				\
		BATiter ri = bat_iterator(r);				\
		sht d[DIRECT_MAX];					\
		Hash hs, *H = &hs;					\
		int collision = 0;					\
									\
		H -> mask = DIRECT_MAX-1;				\
		H -> type = BAThtype(l);				\
									\
		ALGODEBUG fprintf(stderr, "#BATins_%s%s: directcheck[%s, %s, %s, %s, k];\n", #a1, #a2, #a1, #a2, #a3, #a4); \
									\
		assert(l->htype == r->htype && r->htype != TYPE_void);	\
									\
		memset(d, 0, sizeof(d));				\
		BATloop(r, p1, q1) {					\
			h = BUNh##a2(ri,p1);				\
			i = (int) hash_##a4(H, h);			\
			/* collision or check for not-nil (nils don't match anyway) */ \
			if (d[i] != 0 || !(a5)) {			\
				collision = 1;				\
				break;					\
			}						\
			d[i] = ((sht)p1)+1;				\
		}							\
		if (collision) {					\
			hashcheck(a1,a2,a3,_##a4,a5);			\
		} else {						\
			if (!l->ttype && l->tseqbase != oid_nil) {	\
				oid b = l->tseqbase, *t = &b;		\
				a4 *h = (a4*)BUNhloc(li, BUNfirst(l));	\
				a4 *rh = (a4*)BUNhloc(ri, 0);		\
				a4 *bnh;				\
				oid *bnt;				\
				BUN o = BUNfirst(bn);			\
									\
				ALGODEBUG fprintf(stderr, "#BATins_%s%s: directcheck[%s, %s, %s, _%s, k][void tail]; " BUNFMT " " BUNFMT "\n", #a1, #a2, #a1, #a2, #a3, #a4, BATcount(l), BATcount(r)); \
				p1 = 0;					\
				q1 = BATcount(l);			\
				while(p1 < q1) {			\
					BUN r1;				\
					if (p1 + 1 > BATcapacity(bn)){	\
						BATsetcount(bn, o);	\
						if (BATextend(bn, BATgrows(bn)) == NULL) \
							goto bunins_failed; \
					}				\
					r1 = p1 + BATcapacity(bn) - BUNlast(bn); \
					if (r1 > q1) r1 = q1;		\
					bnh = (a4*)Hloc(bn,0);		\
					bnt = (oid*)Tloc(bn,0);		\
					for (; p1<r1; p1++, b++){	\
						i = (int) hash_##a4(H, h+p1); \
						if (d[i] != 0 && a6(h+p1, rh+d[i]-1)) { \
							DHIT##a1(h+p1, b); \
						} else {		\
							DMISS##a1(h+p1, b); \
						}			\
					}				\
				}					\
				BATsetcount(bn, o);			\
				(void)t;				\
			} else {					\
				a4 *h = (a4*)BUNhloc(li, 0);		\
				a4 *rh = (a4*)BUNhloc(ri, 0);		\
									\
				ALGODEBUG fprintf(stderr, "#BATins_%s%s: directcheck[%s, %s, %s, _%s, k]; " BUNFMT " " BUNFMT "\n", #a1, #a2, #a1, #a2, #a3, #a4, BATcount(l), BATcount(r)); \
				p1 = BUNfirst(l);			\
				q1 = BUNlast(l);			\
				while(p1 < q1) {			\
					BUN r1;				\
					if (BUNlast(bn) + 1 > BATcapacity(bn)){	\
						if (BATextend(bn, BATcapacity(bn)+65536) == NULL) \
							goto bunins_failed; \
					}				\
					r1 = p1 + BATcapacity(bn) - BUNlast(bn); \
					if (r1 > q1) r1 = q1;		\
					for (; p1<r1; p1++) {		\
						i = (int) hash_##a4(H, h+p1); \
						if (d[i] != 0 && a6(h+p1, rh+d[i]-1)) { \
							HIT##a1##_nocheck(h+p1, BUNtail(li, p1)); \
						} else {		\
							MISS##a1##_nocheck(h+p1, BUNtail(li, p1)); \
						}			\
					}				\
				}					\
			}						\
		}							\
		(void)h2; /* in some cases the a5 check doesn't use the h2 */ \
	} while (0)

/*
 * checkall: choose the algorithm for head types that have no
 * type-specific case in batcheck.
 *
 *  a1  operation name (intersect or diff).
 *  a2  head accessor suffix (the callers pass var or loc).
 *  a3  suffix for the HASHloop variant; also used by mergecheck to
 *      build its end label.
 *  a4  comparison expression in terms of h and h2.
 *
 * Selection, in this order:
 *  - l has a dense head: hashcheck with the pos accessor and TRUE as
 *    not-nil test;
 *  - the variable hash is set: hashcheck, using the _str_hv probe loop
 *    when the head of l is of type str and its vheap has hashash set;
 *  - otherwise: mergecheck.
 *
 * Expects the variables hash, l and r at the place of expansion, next
 * to everything the selected macro needs.
 */
#define checkall(a1,a2,a3,a4)						\
	do {								\
		if (BAThdense(l)) {					\
			hashcheck(a1,pos,a2,a3,TRUE);			\
		} else if (hash) {					\
			if (l->htype == TYPE_str && l->H->vheap->hashash) { \
				hashcheck(a1,a2,a2,_str_hv,a4);		\
			} else {					\
				hashcheck(a1,a2,a2,a3,a4);		\
			}						\
		} else {						\
			mergecheck(a1,a2,a3,a4);			\
		}							\
	} while (0)

/*
 * check: choose the algorithm for the fixed-size head types that have
 * their own case in batcheck.
 *
 *  a1  operation name (intersect or diff).
 *  a2  head accessor suffix (the callers pass loc).
 *  a3  C type name of the head; passed to directcheck as is and to
 *      hashcheck/mergecheck with a leading underscore.
 *  a4  comparison expression in terms of h and h2.
 *  a5  equality macro for directcheck (one of the *_EQ macros).
 *
 * Selection, in this order:
 *  - l has a dense head: hashcheck with the pos accessor and TRUE as
 *    not-nil test;
 *  - the variable hash is set: directcheck when BATcount(r) is below
 *    DIRECT_MAX, hashcheck otherwise;
 *  - otherwise: mergecheck.
 *
 * Expects the variables hash, l and r at the place of expansion, next
 * to everything the selected macro needs.
 */
#define check(a1,a2,a3,a4,a5)					\
	do {							\
		if (BAThdense(l)) {				\
			hashcheck(a1,pos,a2,_##a3,TRUE);	\
		} else if (hash) {				\
			if (BATcount(r) < DIRECT_MAX) {		\
				directcheck(a1,a2,a2,a3,a4,a5);	\
			} else {				\
				hashcheck(a1,a2,a2,_##a3,a4);	\
			}					\
		} else {					\
			mergecheck(a1,a2,_##a3,a4);		\
		}						\
	} while (0)

/*
 * batcheck(a1) defines the function
 *
 *	static BAT *BATins_k<a1>(BAT *bn, BAT *l, BAT *r)
 *
 * where a1 is intersect or diff.  The function appends to bn the BUNs
 * of l selected by the HIT<a1>/MISS<a1> action macros and returns bn.
 * On failure (any jump to bunins_failed) bn is released with
 * BBPreclaim and NULL is returned.
 *
 * hash starts out TRUE and is cleared when the heads of both l and r
 * are ordered, which makes check/checkall pick the merge algorithm.
 *
 * Two main cases:
 *  - r has a dense head ("voidcheck"): membership is decided by oid
 *    range arithmetic on [rl, rh), with rh = rl + BATcount(r).
 *    If l is dense too, the overlap of the two ranges is translated
 *    into positions hit_start and hit_end in l; BUNs before and after
 *    that window are misses, BUNs inside are hits.  The head pointer h
 *    refers to the local ll, which is incremented together with p1.
 *    If l is not dense, every head oid of l is tested against the
 *    range individually.
 *    t2 is assigned in this case but not used by the action macros.
 *  - otherwise the head type of r selects a type-specific expansion of
 *    check, or checkall with the var or loc accessor for all other
 *    types.  Before the switch, the type is replaced by its storage
 *    type when both share the same nil pointer and the same compare
 *    function.
 *
 * cmp and tnil are set up but only referenced through (void) casts.
 */
#define batcheck(a1)							\
static BAT*								\
BATins_k##a1(BAT *bn, BAT *l, BAT *r)					\
{									\
	int hash = TRUE, (*cmp)(const void *, const void *), (*merge)(const void *, const void *) = NULL; \
	ptr hnil, tnil;							\
	BAT *b = bn;							\
									\
	/* determine how to do the intersect */				\
	if (BAThordered(l) & BAThordered(r)) {				\
		hash = FALSE;						\
	}								\
									\
	merge = BATatoms[l->htype].atomCmp;				\
	cmp = BATatoms[l->ttype].atomCmp;				\
	hnil = ATOMnilptr(l->htype);					\
	tnil = ATOMnilptr(l->ttype);					\
	(void) cmp;							\
	(void) tnil;							\
	(void) hnil;							\
									\
	if (BAThdense(r)) {						\
		/* voidcheck */						\
		BATiter li = bat_iterator(l);				\
		BATiter ri = bat_iterator(r);				\
		BUN p1 = BUNfirst(r), q1 = BUNlast(r);			\
		oid rl = * (oid *) BUNhead(ri, p1);			\
		oid rh = rl + BATcount(r);				\
		ptr h, t = NULL, t2 = NULL;				\
									\
		(void) t2;						\
									\
		ALGODEBUG fprintf(stderr,				\
				  "#BATins_k%s: voidcheck[k, %s];\n",	\
				  #a1, #a1);				\
		if (BAThdense(l)) {					\
			oid ll = * (oid *) BUNhead(li, (p1 = BUNfirst(l))); \
			oid lh = ll + BATcount(l);			\
			BUN hit_start = (q1 = BUNlast(l)), hit_end = q1, w = BUNfirst(r); \
			BUN off = p1;					\
									\
			h = (ptr) &ll;					\
									\
			if (rl >= ll && rl < lh) {			\
				hit_start = off + (rl - ll);		\
			} else if (rl < ll && rh > ll) {		\
				hit_start = p1;				\
				w += (ll - rl);				\
			}						\
			if (rh >= ll && rh < lh) {			\
				hit_end = off + (rh - ll);		\
			}						\
			while(p1 < hit_start) {				\
				t = BUNtail(li, p1);			\
				MISS##a1(h, t);				\
				ll++;					\
				p1++;					\
			}						\
			while(p1 < hit_end) {				\
				t = BUNtail(li, p1);			\
				t2 = BUNtail(ri, w);			\
				HIT##a1(h, t);				\
				ll++;					\
				p1++;					\
				w++;					\
			}						\
			while (p1 < q1) {				\
				t = BUNtail(li, p1);			\
				MISS##a1(h, t);				\
				ll++;					\
				p1++;					\
			}						\
		} else {						\
			BUN off = p1;					\
									\
			BATloop(l, p1, q1) {				\
				oid o = * (oid *) BUNhloc(li, p1);	\
									\
				h = (ptr) &o;				\
				t = BUNtail(li, p1);			\
									\
				if (o >= rl && o < rh) {		\
					BUN w = off + (o - rl);		\
									\
					t2 = BUNtail(ri, w);		\
					HIT##a1(h, t);			\
					continue;			\
				}					\
				MISS##a1(h, t);				\
			}						\
		}							\
	} else {							\
		int tpe = ATOMtype(r->htype);				\
		if (tpe != ATOMstorage(tpe) &&				\
		    ATOMnilptr(ATOMstorage(tpe)) == ATOMnilptr(tpe) &&	\
		    BATatoms[ATOMstorage(tpe)].atomCmp == BATatoms[tpe].atomCmp) \
			tpe = ATOMstorage(tpe);				\
		switch(tpe) {						\
		case TYPE_bte:						\
			check(a1,loc,bte,simple_CMP(h,h2,bte),bte_EQ);	\
			break;						\
		case TYPE_sht:						\
			check(a1,loc,sht,simple_CMP(h,h2,sht),sht_EQ);	\
			break;						\
		case TYPE_int:						\
			check(a1,loc,int,simple_CMP(h,h2,int),int_EQ);	\
			break;						\
		case TYPE_flt:						\
			check(a1,loc,flt,simple_CMP(h,h2,flt),flt_EQ);	\
			break;						\
		case TYPE_dbl:						\
			check(a1,loc,dbl,simple_CMP(h,h2,dbl),dbl_EQ);	\
			break;						\
		case TYPE_lng:						\
			check(a1,loc,lng,simple_CMP(h,h2,lng),lng_EQ);	\
			break;						\
		default:						\
			if (r->hvarsized) {				\
				checkall(a1,var,var,((*merge)(h,h2)));	\
			} else {					\
				checkall(a1,loc,loc,((*merge)(h,h2)));	\
			}						\
			break;						\
		}							\
	}								\
	return b;							\
  bunins_failed:							\
	BBPreclaim(b);							\
	return NULL;							\
}

/* instantiate BATins_kintersect and BATins_kdiff */
batcheck(intersect)
batcheck(diff)


/*
 * BATclone creates a new BAT with BATnew, using the head and tail
 * types of b, the requested capacity cap and the given role.  When a
 * column of the new BAT is of type void and the matching column of b
 * has a non-nil sequence base, that sequence base is copied over.
 * Whatever BATnew returned is handed back, so the result is NULL when
 * BATnew returned NULL.
 */
static BAT *
BATclone(BAT *b, BUN cap, int role)
{
	BAT *clone = BATnew(b->htype, b->ttype, cap, role);

	if (clone == NULL)
		return NULL;

	/* head column */
	if (clone->htype == TYPE_void && b->hseqbase != oid_nil)
		BATseqbase(clone, b->hseqbase);
	/* tail column, reached through the mirror view */
	if (clone->ttype == TYPE_void && b->tseqbase != oid_nil)
		BATseqbase(BATmirror(clone), b->tseqbase);

	return clone;
}

/*
 * Common driver behind BATkdiff and BATkintersect.
 *
 *  l, r   the operands; only their head columns are compared.
 *  diff   non-zero for difference (BATins_kdiff), zero for
 *         intersection (BATins_kintersect).
 *
 * Returns a new TRANSIENT BAT, or NULL when BATnew or the insert
 * routine returned NULL.  The three ERRORcheck lines guard against
 * NULL operands and incompatible head types (ERRORcheck is defined
 * elsewhere; presumably it returns from this function).
 *
 * After filling the result, alignment, ordering, key and nonil
 * properties are derived from l (and, for the head alignment of an
 * intersection, from r).
 */
static BAT *
diff_intersect(BAT *l, BAT *r, int diff)
{
	BAT *bn;
	BUN estimate;

	ERRORcheck(l == NULL, "diff_intersect: left is null");
	ERRORcheck(r == NULL, "diff_intersect: right is null");
	ERRORcheck(TYPEerror(BAThtype(l), BAThtype(r)), "diff_intersect: incompatible head-types");

	/* trivial outcomes when one of the operands is empty */
	if (BATcount(r) == 0) {
		if (diff)
			return BATcopy(l, l->htype, l->ttype, FALSE, TRANSIENT);
		return BATclone(l, 10, TRANSIENT);
	}
	if (BATcount(l) == 0)
		return BATclone(l, 10, TRANSIENT);

	/* initial capacity: |l|, or the smaller operand for an intersection */
	estimate = BATcount(l);
	if (!diff && BATcount(r) < estimate)
		estimate = BATcount(r);

	bn = BATnew(BAThtype(l), BATttype(l), MAX(estimate,BATTINY), TRANSIENT);
	if (bn == NULL)
		return NULL;

	/* fill result bat bn; on failure the insert routine returns NULL */
	if (diff) {
		ALGODEBUG fprintf(stderr, "#diff_intersect: BATins_kdiff(bn, l, r);\n");
		bn = BATins_kdiff(bn, l, r);
	} else {
		ALGODEBUG fprintf(stderr, "#diff_intersect: BATins_kintersect(bn, l, r);\n");
		bn = BATins_kintersect(bn, l, r);
	}
	if (bn == NULL)
		return NULL;

	/* propagate alignment info */
	if (BATcount(bn) == BATcount(l)) {
		/* same number of BUNs as l */
		ALIGNset(bn, l);
	}
	if (!diff &&
	    (BAThordered(l) & BAThordered(r)) &&
	    l->hkey &&
	    BATcount(bn) == BATcount(r)) {
		ALIGNsetH(bn, r);
	}

	/* ordering and key properties */
	if (BATcount(bn) > 1) {
		/* inherited from l */
		bn->hsorted = BAThordered(l);
		bn->hrevsorted = BAThrevordered(l);
		bn->tsorted = BATtordered(l);
		bn->trevsorted = BATtrevordered(l);
		BATkey(bn, BAThkey(l));
		BATkey(BATmirror(bn), BATtkey(l));
	} else {
		/* at most one BUN: sorted both ways and key */
		bn->hsorted = 1;
		bn->hrevsorted = 1;
		bn->tsorted = 1;
		bn->trevsorted = 1;
		BATkey(bn, TRUE);
		BATkey(BATmirror(bn), TRUE);
	}

	bn->H->nonil = l->H->nonil;
	bn->T->nonil = l->T->nonil;
	return bn;
}

/*
 * BATkdiff(l,r): head-based difference.  Thin wrapper that calls
 * diff_intersect with the diff flag set and returns its result.
 */
BAT *
BATkdiff(BAT *l, BAT *r)
{
	BAT *result = diff_intersect(l, r, 1);

	return result;
}

/*
 * BATkintersect(l,r): head-based intersection.  Thin wrapper that
 * calls diff_intersect with the diff flag cleared and returns its
 * result.
 */
BAT *
BATkintersect(BAT *l, BAT *r)
{
	BAT *result = diff_intersect(l, r, 0);

	return result;
}

/*
 * @+ Union
 * Union consists of one version: BATkunion(l,r).  The result is a
 * copy of l followed by the BUNs of r that remain after
 * BATkdiff(r,l), so duplicates between the two operands are removed
 * on the head column only.  The BATkdiff step is skipped when both
 * heads are ordered and the last head of l compares less than the
 * first head of r.
 *
 * If l is empty the operands are swapped first; if r is (or becomes)
 * empty the result is simply a copy of l.  NULL is returned when
 * BATkdiff, BATcopy or the insertion of a BUN fails.
 */
BAT *
BATkunion(BAT *l, BAT *r)
{
	BAT *bn;
	BAT *orig_r = NULL;	/* caller's r, once r has been replaced by BATkdiff(r,l) */
	BATiter li, ri;
	BUN pos, stop;
	int hdisjunct, tdisjunct;
	int ht, tt;

	BATcompatible(l, r);
	if (BATcount(l) == 0) {
		/* make sure a non-empty operand, if any, ends up in l */
		BAT *tmp = l;

		l = r;
		r = tmp;
	}
	if (BATcount(r) == 0)
		return BATcopy(l, l->htype, l->ttype, FALSE, TRANSIENT);

	li = bat_iterator(l);
	ri = bat_iterator(r);
	hdisjunct = BAThordered(r) & BAThordered(l) &&
		    ATOMcmp(l->htype, BUNhead(li, BUNlast(l) - 1), BUNhead(ri, BUNfirst(r))) < 0;
	tdisjunct = BATtordered(r) & BATtordered(l) &&
		    ATOMcmp(l->ttype, BUNtail(li, BUNlast(l) - 1), BUNtail(ri, BUNfirst(r))) < 0;

	if (!hdisjunct) {
		/* heads may overlap: continue with BATkdiff(r,l) instead of r */
		orig_r = r;
		r = BATkdiff(orig_r, l);
		ri.b = r;
		if (r == NULL)
			return NULL;
	}

	if (BATcount(r) == 0) {
		/* nothing to add to l */
		if (orig_r)
			BBPreclaim(r);
		return BATcopy(l, l->htype, l->ttype, FALSE, TRANSIENT);
	}

	/* a void column with a sequence base is copied as an oid column */
	ht = (l->htype == TYPE_void && l->hseqbase != oid_nil) ? TYPE_oid : l->htype;
	tt = (l->ttype == TYPE_void && l->tseqbase != oid_nil) ? TYPE_oid : l->ttype;

	bn = BATcopy(l, ht, tt, TRUE, TRANSIENT);
	if (bn == NULL) {
		if (orig_r)
			BBPreclaim(r);
		return NULL;
	}

	/* append the BUNs of r */
	BATloop(r, pos, stop) {
		bunfastins(bn, BUNhead(ri, pos), BUNtail(ri, pos));
	}

	/* a column stays dense only if both inputs are dense and r
	 * continues exactly where l stops */
	if (!(BAThdense(l) && BAThdense(r) &&
	      * (oid *) BUNhead(li, BUNlast(l) - 1) + 1 == * (oid *) BUNhead(ri, BUNfirst(r)))) {
		bn->hseqbase = oid_nil;
		bn->hdense = 0;
	}
	if (!(BATtdense(l) && BATtdense(r) &&
	      * (oid *) BUNtail(li, BUNlast(l) - 1) + 1 == * (oid *) BUNtail(ri, BUNfirst(r)))) {
		bn->tseqbase = oid_nil;
		bn->tdense = 0;
	}

	bn->H->nonil = l->H->nonil & r->H->nonil;
	bn->T->nonil = l->T->nonil & r->T->nonil;
	bn->H->nil = l->H->nil | r->H->nil;
	bn->T->nil = l->T->nil | r->T->nil;

	if (orig_r) {
		/* done with the temporary; the key checks below look at
		 * the caller's r again */
		BBPreclaim(r);
		r = orig_r;
	}
	HASHdestroy(bn);

	bn->hsorted = hdisjunct;
	bn->hrevsorted = 0;
	bn->tsorted = tdisjunct;
	bn->trevsorted = 0;
	bn->talign = bn->halign = 0;
	if (!r->hkey)
		BATkey(bn, FALSE);
	BATkey(BATmirror(bn), tdisjunct && BATtkey(l) && BATtkey(r));

	return bn;

  bunins_failed:
	/* reached from bunfastins while r is still the temporary (if any) */
	BBPreclaim(bn);
	if (orig_r)
		BBPreclaim(r);
	return NULL;
}
