/*		PROCESS EXAMPLES --- prepare for generating T and for later cross-validation
1. sampling -- ordered
2. assign folds
3. separate pos and neg within the sampled examples

each fold--separate TrainTest from Pos and Neg

4. Train-> FinalT-> Test on TestData

5. separate TestFold for Cross Validation
*/


/* random
The random number generator in YAP is OK.
Every time YAP is launched it gives the same numbers, but after launch it is OK and gives different random numbers.
I used randset to get a list, rather than the approach in Toplog -- assign each example a random number and later test random < ... (with that approach it is difficult to control the size of the sample -- the draws are not necessarily that uniform).

random(+LOW, +HIGH, -NUMBER) gives an integer in [Low,High) = [Low,High-1]
*/

% reorderEx_byLength(+EIs0, -EIs)
% Reorders the example ids in EIs0 by the length of their parsed sequence.
% Each id is tagged as Length-EI (assign_lengthLabel/2), the tagged list is
% put through sort/2 (standard order of terms, duplicates removed), and the
% tags are stripped again (removeLengthLabel_addProveSign/2).
reorderEx_byLength(EIs0,EIs):-
	maplist(assign_lengthLabel,EIs0,Tagged),
	sort(Tagged,SortedTagged),
	maplist(removeLengthLabel_addProveSign,SortedTagged,EIs).

% assign_lengthLabel(+EI, -LabeledEI)
% Looks up example EI via ex/3, expects its body to be parse(Seq), and
% returns the pair Length-EI where Length is the length of Seq.
% The third argument of ex/3 is ignored here.
assign_lengthLabel(EI,Len-EI):-
	ex(EI,parse(Tokens),_),
	length(Tokens,Len).
% removeLengthLabel_addProveSign(+LabeledEI, -EI)
% Strips the length tag from a Length-EI pair, leaving only the example id.
removeLengthLabel_addProveSign(_-EI,EI).


% input is the numberlist

% scriptExGenerator(+TimePoint, +StartIndex, +EndIndex)
% Builds the list of example ids with numbersList/3, splits it into folds
% with get_folds/2, and hands the folds to scriptExGenerators/4, which
% writes one test file and one train file per fold, numbered from 1.
% TimePoint is used by scriptExGenerators/4 as the leading path component
% of the generated file names.
scriptExGenerator(TimePoint,StartIndex,EndIndex):-
	numbersList(StartIndex,EndIndex,ExampleIds),
	get_folds(ExampleIds,Folds),
	scriptExGenerators(Folds,ExampleIds,TimePoint,1).

% scriptExGenerators(+Folds, +AllEIs, +TimePoint, +Index)
% For every fold in Folds (taken as the test ids TestEIs) writes two files:
%   TimePoint/test<Index>.pl   containing test_examples(TestEIs).
%   TimePoint/train<Index>.pl  containing training_examples(TrainingEIs).
%                              followed by test_examples(TestEIs).
% TrainingEIs is AllEIs minus TestEIs, computed with ord_subtract/3, so
% both lists are assumed to be ordered sets -- TODO confirm against
% numbersList/3 and get_folds/2.
% Index is incremented by one for each successive fold.
scriptExGenerators([],_,_,_).
scriptExGenerators([TestEIs|RemainingFolds],AllEIs,TimePoint,Index):-
	% test file for this fold
	atomic_concat([TimePoint,'/',test,Index,'.pl'],TestFile),
	tell(TestFile),
	portray_clause(test_examples(TestEIs)),
	told,

	% training file: everything that is not in the test fold
	ord_subtract(AllEIs,TestEIs,TrainingEIs),
	atomic_concat([TimePoint,'/',train,Index,'.pl'],TrainFile),
	tell(TrainFile),
	portray_clause(training_examples(TrainingEIs)),
	% the test ids are repeated here so the test data can be removed later
	portray_clause(test_examples(TestEIs)),
	told,

	NextIndex is Index+1,
	scriptExGenerators(RemainingFolds,AllEIs,TimePoint,NextIndex).


%%%%%%%%%%%%%%%%%%%%%%%% 
/* 
given: Total Data Size,
	Sample Size,  (E-Random < SampleSize/TotalSize)
	Number of Folds
Output: [{ID,Fold}|List]

% even when not positive-only, if there are only a few negative examples, none of them may be sampled, so NEs may still be []
*/
% exProcess(+TotalDataSize, +SampleSize, +NumFolds, -PEs, -NEs)
% Samples example ids with randset/3, tags each sampled id with a fold
% number (assignFold/3), and splits the tagged list into positive (PEs)
% and negative (NEs) examples, both as lists of Id-Fold pairs.
%   - If set(posOnly,yes) holds, every sampled example is positive and
%     NEs is [].
%   - Otherwise set(posDataSize,PosSize) is read and sepPN/4 does the split.
% Finally the counts, the pair lists and the examples themselves are
% printed to the current output.
exProcess(TotalDataSize,SampleSize,NumFolds,PEs,NEs):-
	% original author's note: no need to +1, since Max is within the range
	randset(SampleSize,TotalDataSize,SampledIds),
	% random/3 excludes its upper bound, hence NumFolds+1
	FoldLimit is NumFolds+1,
	assignFold(FoldLimit,SampledIds,FoldedEs),
	(   set(posOnly,yes)
	->  % TotalDataSize==PosSize in this case, but the flag is used later
	    % too and is more intuitive
	    PEs=FoldedEs,
	    NEs=[]
	;   set(posDataSize,PosSize),
	    sepPN(PosSize,FoldedEs,PEs,NEs)
	),
	length(PEs,NumPE),
	write(NumPE),
	write(' Positive Examples are sampled, they are '),
	write(PEs),
	nl,
	printEfs(PEs),
	length(NEs,NumNE),
	write(NumNE),
	write(' Negative Examples are sampled, they are '),
	write(NEs),
	nl,
	printEfs(NEs).


% printEfs(+FoldedEs)
% For every Id-Fold pair in the list, looks the example up with ex/3 and
% writes its second argument on a line of its own. The fold number and the
% third argument of ex/3 are ignored.
printEfs([]).
printEfs([Id-_|Rest]):-
	ex(Id,Example,_),
	write(Example),
	nl,
	printEfs(Rest).

% assignFold(+Max, +Es, -FoldedEs)
% Pairs every element E of Es with a random fold number, giving E-Fold.
% The fold is drawn with random(1, Max, Fold), i.e. an integer in
% [1, Max-1]; exProcess/5 passes Max = NumFolds+1, so folds are 1..NumFolds.
% The tail is processed before the head's number is drawn, so random
% numbers are consumed from the last element to the first.
assignFold(_,[],[]).
assignFold(Limit,[Ex|Rest],[Ex-FoldNo|FoldedRest]):-
	assignFold(Limit,Rest,FoldedRest),
	random(1,Limit,FoldNo).
	
	
% Es is ordered, so once we encounter an example whose index is bigger than the positive size, that is the start of the negative examples
% sepPN(+PosSize, +Es, -PEs, -NEs)
% Splits the list Es of Id-Fold pairs, ordered by Id, into positives and
% negatives. Ids up to and including PosSize are positive; the first
% element whose Id exceeds PosSize and everything after it are negative.
%
% Fixes over the previous version:
%   - an explicit clause for the empty list, so the predicate succeeds
%     with NEs = [] when no negative example was sampled (it used to fail);
%   - outputs are unified after the cut, so the result does not depend on
%     whether PEs/NEs are already bound at call time.
sepPN(_,[],[],[]).
sepPN(PosSize,[EI-Fold|Rest],PEs,NEs):-
	EI>PosSize, !,
	% first negative found: the remainder of the ordered list is negative
	PEs=[],
	NEs=[EI-Fold|Rest].
sepPN(PosSize,[E|Es],[E|PEs],NEs):-
	sepPN(PosSize,Es,PEs,NEs).
	

% sepPN_ID(+Data, -PE, -NE)
% Splits the list of example ids Data into PE and NE, keeping the input
% order. An id goes to PE when ex(EI,_,1) holds, and to NE otherwise.
%
% The decision is made with if-then-else before the outputs are unified.
% The previous version unified PE in the clause head before its cut, so a
% call with PE already bound could slip into the "negative" clause even
% though ex(EI,_,1) was true.
sepPN_ID([],[],[]).
sepPN_ID([EI|Data],PE,NE):-
	(   ex(EI,_,1)
	->  PE=[EI|PE1],
	    NE=NE1
	;   PE=PE1,
	    NE=[EI|NE1]
	),
	sepPN_ID(Data,PE1,NE1).

/*
sepPN_ID(PosSize,[EI|NEs],[],[EI|NEs]):-
	EI>PosSize, !.
sepPN_ID(PosSize,[E|Es],[E|PEs],NEs):-
	sepPN_ID(PosSize,Es,PEs,NEs).
*/

% sepTestFold(+FoldID, +FoldedEs, -TestEs, -TrainEs)
% Splits the list FoldedEs of E-Fold pairs for one cross-validation round.
% Examples whose Fold is identical (==) to FoldID go to TestEs, all others
% go to TrainEs. The fold tag is dropped and the input order is kept.
%
% The decision is made with if-then-else before the outputs are unified.
% The previous version unified TestEs in the clause head before its cut,
% so with TestEs already bound an example of the test fold could be
% accepted as a training example, e.g. sepTestFold(1,[a-1],[],[a]).
sepTestFold(_,[],[],[]).
sepTestFold(FoldID,[E-Fold|FoldedEs],TestEs,TrainEs):-
	(   Fold==FoldID
	->  TestEs=[E|TestEs1],
	    TrainEs=TrainEs1
	;   TestEs=TestEs1,
	    TrainEs=[E|TrainEs1]
	),
	sepTestFold(FoldID,FoldedEs,TestEs1,TrainEs1).