/**********************************************************************
 *
 * text_t.cpp -- a simple 16-bit character string class
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: text_t.cpp 26520 2012-11-26 11:06:10Z davidb $
 *
 *********************************************************************/

#include "text_t.h"

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\algorithm>
#elif defined(GSDL_USE_STL_H)
#  if defined(GSDL_USE_ALGO_H)
#    include <algo.h>
#  else
#    include <algorithm.h>
#  endif
#else
#  include <algorithm>
#endif

#ifdef HAVE_CONFIG_H
# ifdef __WIN32__
#  include "win32cfg.h"
# else
#  include "config.h"
# endif
#endif

#include <cstring>

#include "unitool.h"
#include <iostream>

// A constant text_t constructed from the empty C string.
const text_t g_EmptyText("");

////////////////////////////////////
// text_t methods
////////////////////////////////////

// Stream inserter for text_t.
// Every 16-bit character is written as exactly one byte: values below
// 256 are written unchanged, anything else is replaced by a space (when
// it is a unicode space) or by a question mark, which tells the user
// that some information is missing.
ostream& operator<< (ostream &o, const text_t &text)
{
  for (text_t::const_iterator pos = text.begin(), stop = text.end();
       pos != stop; ++pos)
    {
      if (*pos >= 256)
        {
          // not representable in a single byte
          o << (is_unicode_space (*pos) ? ' ' : '?');
          continue;
        }
      o << (unsigned char)(*pos);
    }

  return o;
}

// Default constructor: encoding 0 and no characters.
text_t::text_t () 
{
  this->setencoding (0);
  this->clear ();
}

// Construct from an integer: encoding 0, contents are the decimal
// representation of i as produced by appendint.
text_t::text_t (int i) 
{
  this->setencoding (0);
  this->clear ();
  this->appendint (i);
}

// Construct from a null-terminated C string: encoding 0, contents are
// the characters of s as appended by appendcstr.
text_t::text_t (const char *s) 
{
  this->setencoding (0);
  this->clear ();
  this->appendcstr (s);
}

// Construct from a character array of nLength bytes (it need not be
// null-terminated): encoding 0, contents appended by appendcarr.
text_t::text_t (const char *s, size_type nLength)
{
  this->setencoding (0);
  this->clear ();
  this->appendcarr (s, nLength);
}


void text_t::append (const text_t &t) 
{
  text.insert(text.end(), t.begin(), t.end());
}

// Append the characters in [first, last) to the end of this text.
void text_t::appendrange (iterator first, iterator last) 
{
  iterator tail = text.end();
  text.insert(tail, first, last);
}

// Append the characters in [first, last) to the end of this text
// (overload for constant iterators).
void text_t::appendrange (const_iterator first, const_iterator last) 
{
  iterator tail = text.end();
  text.insert(tail, first, last);
}

// Append the decimal representation of i (with a leading '-' for
// negative values) to the end of this text.
void text_t::appendint (int i)
{
  // zero has no digits to peel off in the loop below
  if (i == 0) 
    {
      text.push_back('0');
      return;
    }

  // Work on the magnitude as an unsigned value: negating the most
  // negative int as a signed value would overflow.
  unsigned int magnitude = (unsigned int)i;
  if (i < 0)
    {
      text.push_back('-');
      magnitude = 0u - magnitude;
    }

  // three decimal digits per byte is always enough room (256 < 1000)
  char buf[sizeof(int)*3];
  int len = 0;
  
  // get the digits in reverse order
  while (magnitude > 0)
    {
      buf[len++] = (char)('0' + (magnitude % 10));
      magnitude /= 10;
    }

  // append them most significant digit first
  while (len > 0)
    {
      text.push_back(buf[--len]);
    }
}

int text_t::getint () const
{
  int i = 0;
  int mult = 1; // become -1 for negative numbers

  const_iterator here = text.begin();
  const_iterator end = text.end();
  
  // do plus and minus signs
  if (here != end)
    {
      if (*here == '-')
	{
	  mult = -1;
	  ++here;
	}
      else if (*here == '+')
	{
	  mult = 1;
	  ++here;
	}
    }

  // deal with the number
  while ((here != end) && (*here >= '0') && (*here <= '9'))
    {
      i = 10*i + (*here - '0');
      ++here;
    }

  i *= mult;
  return i;
}

unsigned long text_t::getulong () const
{
  unsigned long i = 0;

  const_iterator here = text.begin();
  const_iterator end = text.end();

  while ((here != end) && (*here >= '0') && (*here <= '9'))
    {
      i = 10*i + (*here - '0');
      ++here;
    }

  return i;
}

// Append len bytes from the character array s. Each byte is treated as
// an unsigned value, so bytes above 127 become characters 128-255.
void text_t::appendcarr (const char *s, size_type len)
{
  // grow once up front rather than repeatedly inside the loop
  if (text.capacity() < (text.size() + len + 2))
    text.reserve((2*text.size()) + len + 2);

  const unsigned char *src = (const unsigned char *)s;
  for (; len > 0; --len, ++src)
    {
      text.push_back (*src);
    }
}

void text_t::appendcstr (const char *s) 
{
  size_t len = strlen(s);
  if (text.capacity() < (text.size() + len + 2)) {
    text.reserve((2*text.size()) + len + 2);
  }
  
  unsigned char *us = (unsigned char *)s;
  while (*us != '\0') 
    {
      text.push_back (*us); // append this character
      ++us;
    }
}


// strings returned from getcarr and getcstr become the caller's
// responsibility and should be deallocated with "delete []"

// Return a newly allocated 8-bit copy of this text and store its length
// in len. The array is NOT null-terminated; the caller must release it
// with "delete []". Characters that do not fit in a byte become a space
// (unicode spaces) or a question mark (everything else).
char *text_t::getcarr(size_type &len) const
{
  unsigned char *bytes = new unsigned char[size()];
  len = 0;

  for (const_iterator pos = begin(), stop = end(); pos != stop; ++pos)
    {
      if (*pos < 256)
        bytes[len] = (unsigned char)(*pos);
      else
        bytes[len] = is_unicode_space (*pos) ? ' ' : '?';
      ++len;
    }

  return (char *)bytes;
}

char *text_t::getcstr() const
{
  unsigned char *cstr = new unsigned char[size() + 1];
  const_iterator ithere = begin();
  const_iterator itend = end();
  int len = 0;

  while (ithere != itend)
    {
      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
      else {
	// put a space or a question mark depending on what
	// the character is. Question marks tell the user that
	// they are missing some information.
	if (is_unicode_space (*ithere)) cstr[len] = ' ';
	else cstr[len] = '?';
      }
      ++len;
      ++ithere;
    }

  cstr[len] = '\0';

  return (char *)cstr;
}


// Replace every occurrence of toreplace in this text with replacement.
// Occurrences are located left to right and do not overlap.
// Returns the number of replacements made. An empty toreplace matches
// nothing: the text is left untouched and 0 is returned.
int text_t::replace(text_t toreplace, text_t replacement)
{
  // An empty pattern "matches" at the current position without consuming
  // any input, so the scan below would never advance.
  if (toreplace.empty()) return 0;

  // Get the beginning and end of the current text
  text_t::iterator text_begin = text.begin(), text_end = text.end();
  int count = 0;
  text_t new_text;

  // Walk the text, copying unmatched stretches and substituting matches
  while (text_begin < text_end)
  {
    // Find where the next toreplace is
    text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);

    if (next_toreplace != text_end)
    {
      // We've found a match: keep what precedes it, then substitute
      new_text.appendrange(text_begin, next_toreplace);
      new_text.append(replacement);
      count++;
      text_begin = next_toreplace + toreplace.size();
    }
    else
    {
      // No further match: keep the remainder unchanged
      new_text.appendrange(text_begin, text_end);
      text_begin = text_end;
    }
  }

  text.clear();
  text = new_text.text_as_usvector();
  return count;
}


// general functions which work on text_ts

// Find the first occurrence of c in [first, last).
// Returns an iterator to the match, or last if c does not occur.
text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last, 
				 unsigned short c)
{
  for (; first != last; ++first)
    {
      if (*first == c) return first;
    }
  return last;
}

// Find the first occurrence of c in [first, last) (mutable iterators).
// Returns an iterator to the match, or last if c does not occur.
text_t::iterator findchar (text_t::iterator first, text_t::iterator last, 
			   unsigned short c)
{
  for (; first != last; ++first)
    {
      if (*first == c) return first;
    }
  return last;
}

// Find the last occurrence of c in [first, last_plus_one).
// Returns an iterator to the match, or last_plus_one if c does not
// occur (including when the range is empty).
text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one, 
			   unsigned short c)
{
  // an empty range has nothing that may be dereferenced
  if (first == last_plus_one) return last_plus_one;

  // scan backwards from the final character down to first (inclusive)
  text_t::iterator current = last_plus_one;
  do {
    --current;
    if (*current == c) return current;
  } while (current != first);

  return last_plus_one;
}

// Find the first occurrence of word within [first, last).
// Returns an iterator to the start of the match, or last if there is
// none. An empty word matches at first whenever the range is non-empty.
text_t::const_iterator findword (text_t::const_iterator first, 
				 text_t::const_iterator last, 
				 const text_t& word)
{
  const text_t::const_iterator w_first = word.begin();
  const text_t::const_iterator w_last = word.end();

  for (; first != last; ++first)
    {
      // try to match the whole word starting at this position
      text_t::const_iterator t = first;
      text_t::const_iterator w = w_first;
      while (w != w_last && t != last && *t == *w)
        {
          ++t;
          ++w;
        }

      if (w == w_last) return first;
    }

  return last; // get to here only if there is no match
}

// Find the first occurrence of word within [first, last) (mutable
// iterators). Returns an iterator to the start of the match, or last if
// there is none. An empty word matches at first whenever the range is
// non-empty.
text_t::iterator findword (text_t::iterator first, 
			   text_t::iterator last, 
			   const text_t& word)
{
  const text_t::const_iterator w_first = word.begin();
  const text_t::const_iterator w_last = word.end();

  for (; first != last; ++first)
    {
      // try to match the whole word starting at this position
      text_t::iterator t = first;
      text_t::const_iterator w = w_first;
      while (w != w_last && t != last && *t == *w)
        {
          ++t;
          ++w;
        }

      if (w == w_last) return first;
    }

  return last; // get to here only if there is no match
}

// Copy the characters from first up to (not including) the next
// occurrence of c into outstr, replacing its previous contents.
// Returns the position just after the delimiter, or last if c was not
// found (in which case outstr holds the whole range).
text_t::const_iterator getdelimitstr (text_t::const_iterator first, 
				      text_t::const_iterator last,
				      unsigned short c, text_t &outstr)
{
  text_t::const_iterator delim = findchar (first, last, c);

  outstr.clear();
  outstr.appendrange (first, delim);

  if (delim == last) return last;
  return ++delim; // skip c
}

// Copy the characters from first up to (not including) the next
// occurrence of c into outstr, replacing its previous contents
// (mutable iterators). Returns the position just after the delimiter,
// or last if c was not found.
text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
				unsigned short c, text_t &outstr)
{
  text_t::iterator delim = findchar (first, last, c);

  outstr.clear();
  outstr.appendrange (first, delim);

  if (delim == last) return last;
  return ++delim; // skip c
}

// Copy the characters from first up to (not including) the next
// occurrence of the word w into outstr, replacing its previous
// contents. Returns the position just after the delimiter word, or last
// if w was not found.
text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
				      text_t w, text_t &outstr)
{
  text_t::const_iterator match = findword (first, last, w);

  outstr.clear();
  outstr.appendrange (first, match);

  if (match == last) return last;
  return match + w.size(); // skip w
}

// Split [first, last) at every occurrence of c and store the pieces in
// the set outlist, whose previous contents are discarded. A delimiter
// at the very end of the range does not produce a trailing empty piece.
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tset &outlist)
{
  outlist.erase(outlist.begin(), outlist.end());

  for (text_t piece; first != last; )
    {
      first = getdelimitstr (first, last, c, piece);
      outlist.insert (piece);
    }
}

// Split [first, last) at every occurrence of c and store the pieces, in
// order, in the list outlist, whose previous contents are discarded. A
// delimiter at the very end of the range does not produce a trailing
// empty piece.
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tlist &outlist)
{
  outlist.erase(outlist.begin(), outlist.end());

  for (text_t piece; first != last; )
    {
      first = getdelimitstr (first, last, c, piece);
      outlist.push_back (piece);
    }
}

// Split [first, last) at every occurrence of c and store the pieces, in
// order, in the array outlist, whose previous contents are discarded. A
// delimiter at the very end of the range does not produce a trailing
// empty piece.
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tarray &outlist)
{
  outlist.erase(outlist.begin(), outlist.end());

  for (text_t piece; first != last; )
    {
      first = getdelimitstr (first, last, c, piece);
      outlist.push_back (piece);
    }
}

// Split [first, last) at every occurrence of the word w and store the
// pieces, in order, in outlist, whose previous contents are discarded.
// An empty w is treated as "no delimiter": a non-empty range is stored
// as a single piece.
void splitword (text_t::const_iterator first, text_t::const_iterator last,
		text_t w, text_tlist &outlist)
{
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;

  if (w.empty())
    {
      // An empty delimiter is found immediately but consumes no input,
      // so the loop below would never advance.
      if (first != last)
        {
          t.appendrange (first, last);
          outlist.push_back (t);
        }
      return;
    }

  while (first != last)
    {
      first = getdelimitstr (first, last, w, t);
      outlist.push_back (t);
    }
}

// join a string using a character
void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
{
  outtext.clear ();

  text_tset::const_iterator here = inlist.begin ();
  text_tset::const_iterator end = inlist.end ();

  if (here != end) {
    outtext += *here; ++here;
    while (here != end) {
      outtext.push_back (c);
      outtext += *here;
      ++here;
    }
  }
}

void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
{
  outtext.clear ();

  text_tlist::const_iterator here = inlist.begin ();
  text_tlist::const_iterator end = inlist.end ();
  if (here != end) {
    outtext += *here; ++here;
    while (here != end) {
      outtext.push_back (c);
      outtext += *here;
      ++here;
    }
  }
}

void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
{
  outtext.clear ();

  text_tarray::const_iterator here = inlist.begin ();
  text_tarray::const_iterator end = inlist.end ();
  if (here != end) {
    outtext += *here; ++here;
    while (here != end) {
      outtext.push_back (c);
      outtext += *here;
      ++here;
    }
  }
}

void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
{
  outtext.clear ();

  text_tlist::const_iterator here = inlist.begin ();
  text_tlist::const_iterator end = inlist.end ();
  if (here != end) {
    outtext += *here; ++here;
    while (here != end) {
      outtext += c;
      outtext += *here;
      ++here;
    }
  }
}

void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
{
  outtext.clear ();

  text_tset::const_iterator here = inlist.begin ();
  text_tset::const_iterator end = inlist.end ();
  if (here != end) {
    outtext += *here; ++here;
    while (here != end) {
      outtext += c;
      outtext += *here;
      ++here;
    }
  }
}

void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
{
  outtext.clear ();

  text_tarray::const_iterator here = inlist.begin ();
  text_tarray::const_iterator end = inlist.end ();
  if (here != end) {
    outtext += *here; ++here;
    while (here != end) {
      outtext += c;
      outtext += *here;
      ++here;
    }
  }
}

// count the occurrences of the character c within [first, last)
int countchar (text_t::const_iterator first, text_t::const_iterator last,
	       unsigned short c)
{
  int hits = 0;
  for (; first != last; ++first)
    {
      if (*first == c) ++hits;
    }
  return hits;
}

// Return a new text_t holding a copy of the characters in [first, last),
// i.e. from first up to but not including last.
text_t substr (text_t::const_iterator first, text_t::const_iterator last) {

  text_t result;
  result.reserve(last - first + 2);

  for (; first != last; ++first)
    result.push_back(*first);

  return result;
}


// Convert every character in [first, last) to lowercase, in place,
// using unicode_tolower.
void lc (text_t::iterator first, text_t::iterator last) {
  for (; first != last; ++first)
    *first = unicode_tolower(*first);
}

// Convert every character in [first, last) to uppercase, in place,
// using unicode_toupper.
void uc (text_t::iterator first, text_t::iterator last) {
  for (; first != last; ++first)
    *first = unicode_toupper(*first);
}


// checks to see if it is a number (i.e. contains only 0-9)
bool is_number (const text_t &text) {

  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();

  while (here != end) {
    if ((*here!='0') && (*here!='1') && (*here!='2') &&
	(*here!='3') && (*here!='4') && (*here!='5') &&
	(*here!='6') && (*here!='7') && (*here!='8') &&
	(*here!='9')) return false;
    ++here;
  }
  return true;
}


// checks to see if the text has any letters or digits
bool has_unicode_letdig (const text_t &text) {
  if (text.empty()) return false;
  
  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();
  while (here != end) {
    if (is_unicode_letdig (*here)) return true;
    ++here;
  }

  return false;
}

// checks to see if a text_t starts with the specified prefix
bool starts_with(const text_t& text, const text_t& prefix) {
  if (prefix.empty()) return true;
  if (text.empty() || text.size()<prefix.size()) return false;
  text_t substring = substr(text.begin(), text.begin()+prefix.size());
  return substring == prefix;
}
// checks to see if a text_t ends with the specified suffix
bool ends_with(const text_t& text, const text_t& suffix) {
  if (suffix.empty()) return true;
  if (text.empty() || text.size() < suffix.size()) return false;
  text_t substring = substr(text.end()-suffix.size(),text.end());
  return substring == suffix;

}

// Returns a copy of text with the leading and trailing unicode
// whitespace removed. Whitespace inside the text is kept. A text that
// consists of whitespace only yields an empty text.
text_t trim (const text_t& text) {

  if (text.size() == 0) {
    return text;
  }

  text_t::const_iterator head = text.begin();
  text_t::const_iterator tail = text.end();

  // skip the leading whitespace
  while (head != tail && is_unicode_space(*head)) {
    head++;
  }

  if (head == tail) {
    // text was all white space
    return "";
  }

  // step back from the end to the last non-space character; head is
  // known to be a non-space so the scan cannot run past it
  tail -= 1;
  while (tail != head && is_unicode_space(*tail)) {
    tail--;
  }

  return substr(head, tail+1);
}

////////////////////////////////////
// convertclass methods
////////////////////////////////////

// conversion classes used for getting information into and out of
// the text_t class.

// Constructor for the converter base class; the body is intentionally
// empty.
convertclass::convertclass () 
{
  // nothing to do
}

// Destructor for the converter base class; the body is intentionally
// empty.
convertclass::~convertclass () 
{
  // nothing to do
}

// Reset the converter. This base implementation does nothing.
void convertclass::reset ()
{
  // nothing to do
}


////////////////////////////////////
// inconvertclass methods
////////////////////////////////////

// convert from a char stream to the text_t class
// the default version assumes the input is an ASCII
// character array

// Construct an input converter with no input attached
// (null start pointer, zero length).
inconvertclass::inconvertclass () 
{
  len = 0;
  start = NULL;
}

// Destructor; the body is intentionally empty (the input buffer set via
// setinput is not released here).
inconvertclass::~inconvertclass () 
{
  // nothing to do
}


// Detach any pending input: the start pointer becomes null and the
// remaining length zero.
void inconvertclass::reset ()
{
  len = 0;
  start = NULL;
}

// Attach an input buffer of thelen bytes starting at thestart.
// Only the pointer is stored; the bytes are not copied.
void inconvertclass::setinput (char *thestart, size_t thelen)
{
  len = thelen;
  start = thestart;
}

// Convert all of the pending input into output (whose previous contents
// are discarded), one character per input byte. The whole input is
// consumed in one call, so status is always set to finished.
void inconvertclass::convert (text_t &output, status_t &status)
{
  output.clear();

  const bool have_input = (start != NULL && len != 0);
  if (!have_input)
    {
      status = finished;
      return;
    }

  if (output.capacity() < len + 2)
    output.reserve(len + 2);
  
  // go through unsigned char so that bytes above 127 are not
  // sign-extended when widened
  unsigned char *cursor = (unsigned char *)start;
  for (; len > 0; --len, ++cursor)
    {
      output.push_back (*cursor);
    }

  start = (char *)cursor; // save current position
  status = finished;
}

// will treat the text_t as an 8-bit string and convert
// it to a 16-bit string using the above convert method.
// Only the low byte of each input character is used. The input is fed
// through the converter in chunks of at most 256 bytes and the result
// is given encoding 0.
text_t inconvertclass::convert (const text_t &t) {
  text_t result;
  text_t piece;
  status_t status;
  unsigned char bytes[256];
  size_t nbytes = 0;

  result.clear();
  if (result.capacity() < t.size() + 2)
    result.reserve(t.size() + 2);

  text_t::const_iterator pos = t.begin();
  const text_t::const_iterator stop = t.end();
  while (pos != stop) {
    // fill the chunk buffer with the low bytes of the next characters
    for (; pos != stop && nbytes < 256; ++pos) {
      bytes[nbytes++] = (unsigned char)(*pos & 0xff);
    }

    if (nbytes > 0) {
      // run the chunk through the converter until it reports finished
      setinput ((char *)bytes, nbytes);
      status = unfinished;
      while (status == unfinished) {
        convert (piece, status);
        result += piece;
      }
      nbytes = 0;
    }
  }

  result.setencoding (0); // unicode

  return result;
}

// A shared instance of the default inconvertclass for simple
// conversions. Note that any function that uses this instance is
// not reentrant. A function that needs to be reentrant
// should declare its own instance instead.
inconvertclass ascii2text_t;


////////////////////////////////////
// outconvertclass methods
////////////////////////////////////

// Convert from a text_t class to a char stream.
// This default version assumes the output is an ASCII
// character array. If you set the output stream you
// can use this class to output to a stream using the
// << operator. The << operator can also be conveniently
// used to set the output stream by doing something like
//
// cout << text_t2ascii << text_tstr << anothertext_tstr;
//
// Construct an output converter with neither an input text nor an
// output stream attached.
outconvertclass::outconvertclass ()
{
  outs = NULL;
  input = NULL;
}

// Destructor; the body is intentionally empty (neither the input text
// nor the output stream is released here).
outconvertclass::~outconvertclass () 
{
  // nothing to do
}


// Detach both the input text and the output stream.
void outconvertclass::reset ()
{
  outs = NULL;
  input = NULL;
}

// Attach theinput as the text to convert and rewind the conversion
// position to its beginning. A null pointer detaches the input and
// leaves the position untouched.
void outconvertclass::setinput (text_t *theinput)
{
  input = theinput;
  if (theinput == NULL) return;

  texthere = theinput->begin();
}

// Attach theinput as the text to convert and set the conversion
// position to thetexthere rather than to the start of the text.
void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
{
  texthere = thetexthere;
  input = theinput;
}
 
// Convert as much of the remaining input text as fits into output.
//   output - destination byte buffer (no terminator is written)
//   maxlen - maximum number of bytes that may be written to output
//   len    - receives the number of bytes written; 0 when there is
//            no input or no output buffer
//   status - finished once the end of the input has been reached (or
//            when there is no input/output), unfinished otherwise
// Characters that do not fit in a byte are written as a space (unicode
// spaces) or a question mark (everything else).
void outconvertclass::convert (char *output, size_t maxlen, 
		      size_t &len, status_t &status)
{
  // callers read len after every call, so give it a defined value on
  // the early-out path as well
  len = 0;

  if (input == NULL || output == NULL)
    {
      status = finished;
      return;
    }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  while ((len < maxlen) && (texthere != textend)) 
    {
      if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
      else {
	// put a space or a question mark depending on what
	// the character is. Question marks tell the user that
	// they are missing some information.
	if (is_unicode_space (*texthere)) *uoutput = ' ';
	else *uoutput = '?';
      }
      ++uoutput;
      ++len;
      ++texthere;
    }
  
  if (texthere == textend) status = finished;
  else status = unfinished;
}

// Converts the 16-bit string t to an 8-bit stream and returns the
// result as a text_t (one character per output byte) with encoding 1.
// The work is done by the buffer-based convert method above, 256 bytes
// at a time. Note that this replaces the converter's current input.
text_t outconvertclass::convert (const text_t &t) {
  text_t result;
  unsigned char bytes[256];
  size_t nbytes = 0;
  status_t status = unfinished;
  
  result.clear(); 
  if (result.capacity() < t.size() + 2)
    result.reserve(t.size() + 2);

  setinput ((text_t *)&t); // discard constant

  for (; status == unfinished; ) {
    convert ((char *)bytes, 256, nbytes, status);
    result.appendcarr ((char *)bytes, nbytes);
  }

  result.setencoding (1); // other encoding
  
  return result;
}


// Remember theouts as the stream that converted text is written to.
// Only the pointer is stored.
void outconvertclass::setostream (ostream *theouts)
{
  this->outs = theouts;
}

// Return the stream previously set with setostream (NULL if none has
// been set since construction or the last reset).
ostream *outconvertclass::getostream ()
{
  return this->outs;
}




// A shared instance of the default outconvertclass for simple
// conversions. It holds the current input and output stream as state.
outconvertclass text_t2ascii;



// stream operators for the output class

// "stream << converter": makes theouts the converter's output stream and
// returns the converter, so that any text_t inserted next in the same
// chain goes through the converter to that stream.
outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
{
  outconverter.setostream(&theouts);
  return outconverter;
}


// size of the intermediate byte buffer used when streaming a text_t
#define STREAMBUFSIZE 256

// "converter << text": converts t with outconverter and writes the
// resulting bytes to the converter's output stream, STREAMBUFSIZE bytes
// at a time. Does nothing if no output stream has been set. This
// replaces the converter's current input with t.
outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
{
  ostream *outstream = outconverter.getostream();

  if (outstream == NULL) return outconverter;

  char outbuf[STREAMBUFSIZE];
  // initialised so that a converter which returns without setting the
  // length cannot make us write an indeterminate number of bytes
  size_t len = 0;
  outconvertclass::status_t status = outconvertclass::unfinished;

  // assume that there is no data needing converting
  // left in the converter
  outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion

  while (status == outconvertclass::unfinished)
    {
      outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
      if (len > 0) outstream->write(outbuf, len);
    }

  return outconverter;
}
