/**********************************************************************
 *
 * phrase.cpp -- implementation of the phrase object used by suffix.cpp
 *
 * Copyright 2000 Gordon W. Paynter
 * Copyright 2000 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include <assert.h>
#include <stdio.h>

#if defined(GSDL_USE_STL_H)
#  include <vector.h>
#else
#  include <vector>
#endif

#if defined(GSDL_USE_IOS_H)
#  include <iostream.h>
#else
#  include <iostream>
#endif

#include "suffix.h"
#include "phrase.h"


// Phrase constructor functions

Phrase::Phrase(symbol *words, cellcount size, int direction) {
  empty();
  length = size;

  if (direction == SUFFIX) {
     forward = words;
     back = forward + size - 1;
  } else {
    back = words;
    forward = back - size + 1;
  }
}


Phrase::Phrase() {
  empty();
}


Phrase::Phrase(const Phrase &p) {
  forward = p.forward;
  back = p.back;
  length = p.length;

  suffixFound = p.suffixFound;
  prefixFound = p.prefixFound;

  firstSuffix = p.firstSuffix;
  lastSuffix  = p.lastSuffix;
  firstSuffixIndex = p.firstSuffixIndex;
  lastSuffixIndex  = p.lastSuffixIndex;
  suffixFrequency  = p.suffixFrequency;

  firstPrefix = p.firstPrefix;
  lastPrefix  = p.lastPrefix;
  firstPrefixIndex = p.firstPrefixIndex;
  lastPrefixIndex  = p.lastPrefixIndex;
  prefixFrequency  = p.prefixFrequency;

  uniqueSuffixExtension = p.uniqueSuffixExtension;
  uniquePrefixExtension = p.uniquePrefixExtension;
}


// Empty the contents of a phrase

int Phrase::empty() {

  forward = back = NULL;
  length = 0;

  suffixFound = prefixFound = 0;
  firstSuffix = firstPrefix = NULL;
  lastSuffix = lastPrefix = NULL;

  firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
  firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;

  uniqueSuffixExtension = uniquePrefixExtension = -1;

  return 0;
}

// Remove the details of the location where a suffix/prefix
// is found in the array.

int Phrase::clearSuffix() {
  suffixFound = 0;
  firstSuffix = lastSuffix = NULL;
  firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
  uniqueSuffixExtension = -1;
  return 0;
}

int Phrase::clearPrefix() {
  prefixFound = 0;
  firstPrefix = lastPrefix = NULL;
  firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
  uniquePrefixExtension = -1;
  return 0;
}

// Increase the length of s phrase
//
// We increase the length of a string "in place".  When we expand
// the suffix, however, we invalidate the prefix data, and vice-versa.

int Phrase::increaseSuffixLength(cellcount l) {
  length = l;
  clearPrefix();
  back = forward + l - 1;
  return 0;
}

int Phrase::increasePrefixLength(cellcount l) {
  length = l;
  clearSuffix();
  forward = back - l + 1;
  return 0;
}

// Shorten a phrase by one symbol
int Phrase::shortenByOneAtSuffix() {
  --length;
  --back;
  if (phraseMode==STOPWORDS) {
    while (*back >=firstStopSymbol && *back <= lastStopSymbol) {
      --length;
      --back;
    }
  }  
  clearSuffix();
  clearPrefix();
  return 0;
}

int Phrase::shortenByOneAtPrefix() {
  --length;
  ++forward;
  if (phraseMode==STOPWORDS) {
    while (*forward >=firstStopSymbol && *forward <= lastStopSymbol) {
      --length;
      ++forward;
    }
  }
  clearSuffix();
  clearPrefix();
  return 0;
}


// Output a phrase to a stream
ostream &operator<<(ostream &stream, const Phrase &phrase)
{
  assert(phrase.forward);
  symbol *s = phrase.forward;

  stream << "s" << *s++;
  for (cellcount i = 1; i < phrase.length; ++i)
    stream << " s" << *s++;

  return stream;
}


// Convert the phrase to a string
// Note thgat you have to delete the memory yourself.
char *Phrase::toString() 
{
  assert(forward);

  char *str;
  str = new char[length*20];
  symbol *s = forward;
  sprintf(str, "s%d", *s++);

  for (cellcount i = 1; i < length; ++i) {
    sprintf(str, "%s s%d", str, *s++);
  }

  return str;
}


// Compare a phrase to an another array of symbols
//
// Given an array of words with a specified length, 
// compare those words to this phrase.
//
// Return 0 if the two phrases are the same over length length
// Return 1 if the phrase is "greater" than tyhe words, or
// Return -1 if the phrase is less.

int Phrase::compareSuffix(symbol *words, cellcount length) {
  assert(forward);
  symbol *p = forward;
  
  for (cellcount i = 0; i < length; ++i) {
    if (*p > *words) {
      return 1;
    } else if (*p < *words) {
      return -1;
    } else {
      *p++;
      *words++;
    }
  }

  return 0;

}

int Phrase::comparePrefix(symbol *words, cellcount length) {
  
  assert(back);
  symbol *p = back;
  
  for (cellcount i = 0; i < length; ++i) {
    if (*p > *words) {
      return -1;
    } else if (*p < *words) {
      return 1;
    } else {
      *p--;
      *words--;
    }
  }

  return 0;

}


// Does the phrase have a unique suffix/prefix extension?
//
// For suffix:
//   If uniqueSuffixExtensions is 1, then phrase has some unique expansion. 
//   If it is 0 then it does not.  If it is -1 then we don't know, and have
//   to calculate it from the suffix array.  Same goes for prefixes.

int Phrase::hasUniqueSuffixExtension() {

  // if we don't know, then work it out
  if (uniqueSuffixExtension == -1) {

    ensureSuffixFound();

    // pointers to the phrase's first and last occurances in te suffixArray
    symbol *fst = firstSuffix + length;
    symbol *lst = lastSuffix + length;

    // in ANYPHRASE mode, simply check the next symbol
    if (phraseMode == ANYPHRASE) {
      if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
	uniqueSuffixExtension = 1;
      } else {
	uniqueSuffixExtension = 0;
      }
    }
    // in STOPWORDS mode, make sure there is a unique next content symbol
    else {
      uniqueSuffixExtension = 0;
      while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
	if (*fst >= firstContentSymbol) {
	  uniqueSuffixExtension = 1;
	  break;
	}
	++fst;
	++lst;
      }
    }
  }

  return uniqueSuffixExtension;
}

int Phrase::hasUniquePrefixExtension() {

  // if we don't know, then work it out
  if (uniquePrefixExtension == -1) {

    ensurePrefixFound();

    // pointers to the phrase's first and last occurances in the prefixArray
    symbol *fst = firstPrefix - length;
    symbol *lst = lastPrefix - length;

    // in ANYPHRASE mode, simply check the next symbol
    if (phraseMode == ANYPHRASE) {
      if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
	uniquePrefixExtension = 1;
      } else {
	uniquePrefixExtension = 0;
      }
    }
    // in STOPWORDS mode, make sure there is a unique next content symbol
    else {
      uniquePrefixExtension = 0;
      while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
	if (*fst >= firstContentSymbol) {
	  uniquePrefixExtension = 1;
	  break;
	}
	--fst;
	--lst;
      }
    }
  }

  return uniquePrefixExtension;
}


// Expand a phrase with a unique suffix/prefix by 1 symbol
//
// Note that in STOPWORDS mode a "unique extension" means a unique
// extension that ends on a content symbol.  Although we may extend 
// such a phrase by 2 or more symbols, we will only extend it by a
// single content symbol.

int Phrase::expandUniqueSuffixExtensionByOne() {
  assert(suffixFound);
  assert(uniqueSuffixExtension == 1);

  // in ANYPHRASE mode, expand the phrase by one symbol
  if (phraseMode == ANYPHRASE) {
    increaseSuffixLength(length + 1);
  }
  // in STOPWORDS mode, expand the phrase up to the next content symbol
  else {
    symbol *next = firstSuffix + length;
    cellcount newlength = length + 1;

    while (*next < firstContentSymbol) {
      ++next;
      ++newlength;
    }
    increaseSuffixLength(newlength);
  }

  uniqueSuffixExtension = -1;
  return 0;
}

int Phrase::expandUniquePrefixExtensionByOne() {
  assert(prefixFound);
  assert(uniquePrefixExtension == 1);

  // in ANYPHRASE mode, expand the phrase by one symbol
  if (phraseMode == ANYPHRASE) {
    increasePrefixLength(length + 1);
  }
  // in STOPWORDS mode, expand the phrase to the next content symbol
  else {
    symbol *next = firstPrefix - length;
    cellcount newlength = length + 1;
    while (*next < firstContentSymbol) {
      --next;
      ++newlength;
    }
    increasePrefixLength(newlength);

  }

  uniquePrefixExtension = -1;
  return 0;
}


// Find the phrase in a suffixArray
//
// Given a suffix array, find the first and last occurances of the phrase.
// The begin and end parameters identify which cells to search (inclusive). 

int Phrase::findFirstAndLastSuffix() {
  
  return findFirstAndLastSuffix(0, inputLength-1);
}

int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {

  // First find *any* occurance of the phrase
  assert(begin <= end);

  // if we're only searching one cell, it is very easy
  if (begin == end) {
    assert(compareSuffix(suffixArray[begin], length) == 0);
    firstSuffix = lastSuffix = suffixArray[begin];
    firstSuffixIndex = lastSuffixIndex = begin;
    suffixFrequency = 1;
    suffixFound = 1;
    return 0;
  }

  cellindex c;
  int cmp;

  do {
    c = (end + begin) / 2;
    cmp = compareSuffix(suffixArray[c], length);
    if (cmp == 1) {
      // target phrase is lower than phrase at suffixArray[c]
      begin = c + 1;
    } else if (cmp == -1) {
      // target phrase is higher than phrase at suffixArray[c]
      end = c - 1;
    }
  } while (cmp != 0);
    
  cellindex lastbegin = c;
  cellindex lastend = end;

  // Next find the first occurance of the phrase.
  // We know that the first occurance must be between the 
  // current value of begin and the current value of c.
  end = c;

  do {
    if (begin == end) {
      c = begin;
      cmp = 0;
    } else {
      c = (begin + end) / 2;
      cmp = compareSuffix(suffixArray[c], length);
      if (cmp == 1) {
	// target phrase is lower than phrase at suffixArray[c]
	begin = c + 1;
      } else {
	assert(cmp == 0);
	// target phrase is the same as the phrase at suffixArray[c].  However,
	// to find the first occurance, suffixArray[c] must be the same as the 
	// phrase, but suffixArray[c-1] must be different.
	if (c>0) {
	  cmp = compareSuffix(suffixArray[c-1], length);
	  if (cmp == 0) {
	    end = c - 1;
	    assert(end >= begin);
	    cmp = 1;
	  } else {
	    cmp = 0;
	  }
	}
      }
    }
  } while (cmp != 0);

  // we have found the first location
  firstSuffixIndex = c;
  firstSuffix = suffixArray[c];
  
  // Next find the last occurance of the phrase.
  // We previously stored some bounds for its loccation.
  begin = lastbegin;
  end = lastend;
  
  do {
    if (begin == end) {
      c = begin;
      cmp = 0;
    } else {
      c = (begin + end) / 2;
      cmp = compareSuffix(suffixArray[c], length);
      if (cmp == -1) {
	// target phrase is greater than phrase at suffixArray[c]
	end = c - 1;
      } else {
	assert(cmp == 0);
	// target phrase is the same as the phrase at suffixArray[c].  However,
	// to find the last occurance, one of two additional condiditons must be met:
	// either c must be the very last cell in the array, or
	// suffixArray[c+1] must be different from phrase
	if (c < inputLength-1) {
	  cmp = compareSuffix(suffixArray[c+1], length);
	  if (cmp == 0) {
	    begin = c + 1;
	    cmp = -1;
	  } else {
	    cmp = 0;
	  }
	} 
      }
    }
  } while (cmp != 0);
  
  lastSuffixIndex = c;
  lastSuffix = suffixArray[c];
  suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
  suffixFound = 1;

  return 0;
}


// Find the phrase in a prefix array
//
// Given a prefix array, find the first and last occurances of the phrase.
// The begin and end parameters identify which cells to search (inclusive). 

int Phrase::findFirstAndLastPrefix() {
  return findFirstAndLastPrefix(0, inputLength-1);
}

int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {

  // First find *any* occurance of the phrase
  assert(begin <= end);

  // if we're only searching one cell, it is very easy
  if (begin == end) {
    assert(comparePrefix(prefixArray[begin], length) == 0);
    firstPrefix = lastPrefix = prefixArray[begin];
    firstPrefixIndex = lastPrefixIndex = begin;
    prefixFrequency = 1;
    prefixFound = 1;
    return 0;
  }

  cellindex c;
  int cmp;

  do {
    c = (end + begin) / 2;
    cmp = comparePrefix(prefixArray[c], length);
    if (cmp == 1) {
      begin = c + 1;
    } else if (cmp == -1) {
      end = c - 1;
    }
  } while (cmp != 0);
    
  cellindex lastbegin = c;
  cellindex lastend = end;

  // Next find the first occurance of the phrase.
  // We know that the first occurance must be between the 
  // current value of begin and the cureent value of c.
  end = c;

  do {
    if (begin == end) {
      c = begin;
      cmp = 0;
    } else {
      c = (begin + end) / 2;
      cmp = comparePrefix(prefixArray[c], length);
      if (cmp == 1) {
	// target phrase is lower than phrase at prefixArray[c]
	begin = c + 1;
      } else {
	assert(cmp == 0);
	// target phrase is the same as the phrase at prefixArray[c].  However, to
	// find the first occurance, one of two additional condiditons must be met:
	// either  c == the first cell in prefix array
	// or      prefixArray[c-1] != phrase.
	if (c > 0) {
	  cmp = comparePrefix(prefixArray[c-1], length);
	  if (cmp == 0) {
	    end = c - 1;
	    assert(end >= begin);
	    cmp = 1;
	  } else {
	    cmp = 0;
	  }
	}
      }
    }
  } while (cmp != 0);

  // we have found the first location
  firstPrefixIndex = c;
  firstPrefix = prefixArray[c];

  // Next find the last occurance of the phrase.
  // We previously stored some bounds for its loccation.
  begin = lastbegin;
  end = lastend;
  
  do {
    if (begin == end) {
      c = begin;
      cmp = 0;
    } else {
      c = (begin + end) / 2;
      cmp = comparePrefix(prefixArray[c], length);
      if (cmp == -1) {
	// target phrase is greater than phrase at prefixArray[c]
	end = c - 1;
      } else {
	assert(cmp == 0);
	// target phrase is the same as the phrase at prefixArray[c].  However,
	// to find the last occurance, prefixArray[c] must be the same as the 
	// phrase, but prefixArray[c+1] must be different.
	cmp = comparePrefix(prefixArray[c+1], length);
	if (cmp == 0) {
	  begin = c + 1;
	  cmp = -1;
	} else {
	  cmp = 0;
	}
      } 
    }
  } while (cmp != 0);
  
  lastPrefixIndex = c;
  lastPrefix = prefixArray[c];
  prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
  prefixFound = 1;

  return 0;
}


// Calculate a set of initial suffix/prefix candidates
//
// Expand the phrase to find the initial 
// set of candidates that may be extensions of that phrase,
// and add them to the end of the results vector

void Phrase::initialSuffixCandidates(vector<Phrase> &results) {

  ensureSuffixFound();
  Phrase next;
  cellindex i = firstSuffixIndex;

  // Find all the expansions of Phrase p
  while (i <= lastSuffixIndex) {

    // create new phrase exactly one longer than the current one
    next = newPhraseShortestSuffixExpansion(i);

    // If the expansion occurs more than once and is not delimited, expand it
    if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= minOccurs)) {
      next.expandWhileUniqueSuffixExtension();
      results.push_back(next);
    }
    
    // Move onto the next expansion
    i = next.lastSuffixIndex + 1;
  }
}


void Phrase::initialPrefixCandidates(vector<Phrase> &results) {

  ensurePrefixFound();
  Phrase next;
  cellindex i = firstPrefixIndex;

  // Find all the expansions of Phrase p
  while (i <= lastPrefixIndex) {

    // create new phrase exactly one longer than the current one
    next = newPhraseShortestPrefixExpansion(i);

    // If the expansion occurs more than once and is not delimited, expand it
    if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= minOccurs)) {
      next.expandWhileUniquePrefixExtension();
      results.push_back(next);
    }
    
    // Move onto the next expansion
    i = next.lastPrefixIndex + 1;
  }
}


// Create a new phrase that is longer then the current one, but as short
// as possible.  In ANYPHRASE mode this means the new phrase is exactly
// one symbol longer than the current one, but in STOPWORD mode the expansion
// must introduce a new content symbol
//
// Note that we never expand across delimiters; if we encounter a delimiter
// then this becomes the "end" symbol of the phrase.  The calling function 
// should check for this condition.

Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
  
  // create a new phrase exactly one symbol longer than the current one
  Phrase p(suffixArray[i], length + 1, SUFFIX);

  if (phraseMode == STOPWORDS) {
    // in STOPWORDS mode we must introduce a new content symbol
    while ((*(p.back) >= firstStopSymbol) &&
	   (*(p.back) <= lastStopSymbol)) {
      p.increaseSuffixLength(p.length + 1);
    }
  }

  p.findFirstAndLastSuffix(i, lastSuffixIndex);
  return p;
}

Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
  
  // create a new phrase exactly one symbol longer than the current one
  Phrase p(prefixArray[i], length + 1, PREFIX);

  if (phraseMode == STOPWORDS) {
    // in STOPWORDS mode we must introduce a new content symbol
    while ((*(p.forward) >= firstStopSymbol) &&
	   (*(p.forward) <= lastStopSymbol)) {
      p.increasePrefixLength(p.length + 1);
    }
  }

  p.findFirstAndLastPrefix(i, lastPrefixIndex);
  return p;
}


// Expand a phrase until it no longer has a unique suffix extension
// 
// If the phrase only occurs once in the suffix array, do nothing.
//
// If we are in stopwords mode then we expand if there is a unique 
// suffix expansion and this expansion is a content word.  We do this
// with the content_len variable, which holds the length of the longest
// phrase ending in a content word. 

int Phrase::expandWhileUniqueSuffixExtension() {
  ensureSuffixFound();

  // if the phrase occurs only once, do nothing
  if (suffixFrequency < minOccurs)
    return 0;

  // if we already know there's no unique suffix extension, do nothing
  if (uniqueSuffixExtension == 0)
    return 0;

  // count the length over which the cells are the same
  // and no delimiter is crossed.
  symbol *fst = firstSuffix;
  symbol *lst = lastSuffix;
  cellindex len = 0;
  cellindex content_len = 0;

  // Calculate the length over which the phrases match
  while((*fst == *lst) && (*fst > LASTDELIMITER)) {
    ++len;
    if (*fst > lastStopSymbol) content_len = len;
    ++fst;
    ++lst;
  }

  // in ANYPHRASE mode, expand the phrase for as long as possible
  if (phraseMode == ANYPHRASE) {
    if (len > length) {
      increaseSuffixLength(len);
    }
  } 
  // in STOPWORDS mode, expand only as far as the last contentword
  else {
    if (content_len > length) {
      increaseSuffixLength(content_len);
    }
  }

  uniqueSuffixExtension = 0;
  return 0;
}

int Phrase::expandWhileUniquePrefixExtension() {
  ensurePrefixFound();

  // if the phrase occurs only once, do nothing
  if (prefixFrequency < minOccurs)
    return 0;

  // if we already know there's no unique extension, do nothing
  if (uniquePrefixExtension == 0)
    return 0;

  // count the length over which the cells are the same
  // and no delimiter is crossed.
  symbol *fst = firstPrefix;
  symbol *lst = lastPrefix;
  cellindex len = 0;
  cellindex content_len = 0;

  // Calculate the length over which the phrases match
  while((*fst == *lst) && (*fst > LASTDELIMITER)) {
    ++len;
    if (*fst > lastStopSymbol) content_len = len;
    --fst;
    --lst;
  }

  // in ANYPHRASE mode, expand the phrase for as long as possible
  if (phraseMode == ANYPHRASE) {
    if (len > length) {
      increasePrefixLength(len);
    }
  }
  // in STOPWORDS mode, expand only as far as the last contentword
  else {
    if (content_len > length) {
      increasePrefixLength(content_len);
    }
  }

  uniquePrefixExtension = 0;
  return 0;
}

// Compare the length of two phrases
//
// Given two phrases, return true if the first is shorter/longer,
// otherwise return false.  For use in various sort functions.

bool isShorter(Phrase p1, Phrase p2) {
  if (p1.length < p2.length) {
    return true;
  }
  return false;
}

bool isLonger(Phrase p1, Phrase p2) {
  if (p1.length > p2.length) {
    return true;
  }
  return false;
}


