/**************************************************************************
 *
 * stemmer.cpp -- The stemmer/case folder
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "sysfuncs.h"
#include "stemmer.h"
#include "lovinstem.h"
#include "simplefrenchstem.h"
#include "unitool.h"

#ifdef ENABLE_ACCENTFOLD
/* [JFG - Mar 06: Accent folding patch] */
#include "unac.h"
#endif

#define LOVINSTEMMER        0
#define SIMPLEFRENCHSTEMMER 1


/* Case-fold a length-prefixed UTF-8 word in place.
 *
 * The word is decoded to unicode, every character is lower-cased and
 * then simplify-folded, and the result is encoded back over the input.
 * The re-encoding is limited to the original byte length of the word,
 * so the folded word never becomes longer than the one passed in.
 *
 * word: UTF-8 word whose first byte holds its length in bytes. */
static void mgpp_unicode_casefold (u_char *word) {
  unsigned short decoded[256]; /* decoded[0] receives the character count */
  int count;
  int pos;

  /* UTF-8 -> unicode */
  utf8_word_to_unicode (word, decoded, 255);
  count = decoded[0];

  /* fold each character; the characters start at index 1 */
  for (pos = 1; pos <= count; ++pos) {
    unsigned short lowered = unicode_tolower (decoded[pos]);
    decoded[pos] = unicode_tosimplified (lowered);
  }

  /* unicode -> UTF-8.  The third argument is the maximum length of the
   * output string NOT counting the length byte, which is why it is
   * word[0] rather than word[0]+1. */
  unicode_to_utf8_word (decoded, word, word[0]);
}

#ifdef ENABLE_ACCENTFOLD
/* [JFG - Mar 06: Accent folding patch] */
/* =========================================================================
 * Function: mgpp_unicode_accentfold
 *
 * Description: remove accents from the characters of a word, in place
 * Input: a UTF-8 word string with its length in bytes in the first byte
 * Output: the unaccented word, with its length in bytes in the first byte
 *
 * Notes:
 *  - If unac_string reports an error (or hands back no buffer) the word
 *    is left untouched.
 *  - The size of the caller's buffer is not known here, so the result is
 *    never allowed to grow beyond the original length of the word and no
 *    byte outside the original extent of the word is written.  If the
 *    unaccented form is longer it is truncated on a UTF-8 character
 *    boundary.
 * ========================================================================= */
void mgpp_unicode_accentfold (unsigned char *word) {      
  size_t unac_size = 0;
  char *unac = NULL;
  size_t copy_len;

  if (unac_string("utf-8", (char*)word+1, word[0], &unac, &unac_size) != 0
      || unac == NULL) {
    /* conversion failed: keep the word as it was */
    free(unac);
    return;
  }

  copy_len = unac_size;
  if (copy_len > word[0]) {
    /* the unaccented form expanded; clamp it to the space we know we own */
    copy_len = word[0];

    /* unac[copy_len] is the first byte that gets dropped.  While it is a
     * UTF-8 continuation byte (10xxxxxx) the cut would split a multi-byte
     * character, so back up to the start of that character. */
    while (copy_len > 0 && ((unsigned char)unac[copy_len] & 0xC0) == 0x80) {
      --copy_len;
    }
  }

  /* copy exactly the bytes that are kept; copy_len <= 255 so it fits
   * in the length byte */
  memcpy(word+1, unac, copy_len);
  word[0] = (unsigned char)copy_len;
  
  free(unac);
}
#endif
      
/* Map a textual stemmer description to its stemmer number.
 *
 * The description is compared case-insensitively; anything beyond
 * MAX_STEM_DESCRIPTION_LEN-1 characters is ignored.
 *
 *   "0", "english", "lovin"        -> LOVINSTEMMER
 *   "1", "french", "simplefrench"  -> SIMPLEFRENCHSTEMMER
 *
 * Returns -1 if the description is NULL or is not recognised. */
int mgpp_stemmernumber (u_char *stemmerdescription) {
  u_char descript[MAX_STEM_DESCRIPTION_LEN];
  int i;

  if (stemmerdescription == NULL)
    return -1;

  /* copy and case-fold the description */
  for (i=0; (stemmerdescription[i] != '\0') && 
         (i < MAX_STEM_DESCRIPTION_LEN-1); ++i)
    descript[i] = tolower (stemmerdescription[i]);
  descript[i] = '\0';

  /* map the description to its number */

  if ((strcmp ((char *) descript,  "0") == 0) ||
      (strcmp ((char *) descript, "english") == 0) ||
      (strcmp ((char *) descript, "lovin") == 0))
    return LOVINSTEMMER;

  /* strcmp returns 0 on equality; the previous "== 1" test meant that "1"
   * was never recognised as the French stemmer */
  if ((strcmp ((char *) descript, "1") == 0) ||
      (strcmp ((char *) descript, "french") == 0) ||
      (strcmp ((char *) descript, "simplefrench") == 0))
    return SIMPLEFRENCHSTEMMER;

  return -1;
}



/*
 * Apply case folding, accent folding and/or stemming to a word, in place.
 * The steps that are requested always run in this order: case folding,
 * then accent folding, then stemming.
 *
 * The method selects the steps:
 *   Method 0 - Do not stem or case fold.
 *   Method 1 - Case fold.
 *   Method 2 - Stem.
 *   Method 3 - Case fold and stem.
 *   Method 4 - Accent fold
 *   Method 5 - Accent fold and case fold
 *   Method 6 - Accent fold and stem
 *   Method 7 - Accent fold, stem and case fold
 *
 * Accent folding is only compiled in when ENABLE_ACCENTFOLD is defined;
 * otherwise a request for it is silently ignored.
 *
 * The stemmer number should be obtained using the mgpp_stemmernumber
 * function above.  An unknown stemmer number leaves the word unstemmed.
 */
void 
mgpp_stemmer (int method, int stemmer, u_char *word) {
  if ((method & STEM_CaseFolding) != 0)
    mgpp_unicode_casefold (word);

#ifdef ENABLE_ACCENTFOLD
  if ((method & STEM_AccentFolding) != 0)
    mgpp_unicode_accentfold (word);
#endif

  /* nothing more to do unless stemming was asked for */
  if ((method & STEM_Stemming) == 0)
    return;

  if (stemmer == LOVINSTEMMER) {
    lovinstem (word);
  } else if (stemmer == SIMPLEFRENCHSTEMMER) {
    simplefrenchstem (word);
  }
}
