/**************************************************************************
 *
 * mgpp_weights_build.cpp -- Program to build the document weights file
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#define _XOPEN_SOURCE 1
// This was added for Solaris, but it makes things worse on Solaris for me...
// #define _XOPEN_SOURCE_EXTENDED 1

/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
#if defined (__WIN32__) || defined (__CYGWIN__)
# include "getopt_old.h"
#else
# include <unistd.h>
#endif

#include "UCArray.h"
#include "sysfuncs.h"
#include "memlib.h"
#include "messages.h"
#include "local_strings.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"
#include "mg_files.h"
#include "locallib.h"
#include "invf.h"
#include "FIvfLevelInfo.h"
#include "FragLevelConvert.h"

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\map>
#elif defined(GSDL_USE_STL_H)
#  include <map.h>
#else
#  include <map>
#endif

// number of bits in one mg_u_long; WriteApproxWeights flushes its
// packing buffer every time this many bits have been filled
#define MAXBITS (sizeof(mg_u_long) * 8)

struct WBTagPtr {
  mg_u_long tagNum;
  mg_u_long tagPtr;
  mg_u_long fragOccur;
  
  WBTagPtr () {
    tagNum = 0;
    tagPtr = 0;
    fragOccur = 0;
  }
};

// maps tags to tag information
typedef map<UCArray, WBTagPtr, DictLTUCArray> WBTagDict;

// one accumulator per level document (slot levelDocNum-1); each holds the
// sum of squared term weights until the square root is taken on output
typedef vector<float> Weights;


static void ReadTagDict (const invf_dict_header &idh,
			 FILE *dictFile,
			 FILE *invfIdxFile,
			 WBTagDict &tagDict) {
  tagDict.erase (tagDict.begin(), tagDict.end());

  // seek to the start of the tag information
  fseek (dictFile, idh.tag_dict_start, SEEK_SET);
  fseek (invfIdxFile, sizeof(mg_u_long) +
	 idh.word_dict_size*sizeof(mg_u_long), SEEK_SET);
  
  mg_u_long tagNum;
  mg_u_long tagPtr;
  dict_el thisEl;
  for (tagNum = 0; tagNum < idh.tag_dict_size; ++tagNum) {
    thisEl.Read (dictFile);
    ReadUL (invfIdxFile, tagPtr);
    tagDict[thisEl.el].tagNum = tagNum;
    tagDict[thisEl.el].tagPtr = tagPtr;
    tagDict[thisEl.el].fragOccur = thisEl.frag_occur;
  }
}

// Adds the squared contribution (termFreq * idf)^2 of one term to the
// accumulator of level document levelDocNum.  Document numbers start at
// one while the vector is indexed from zero.
static void AddWeight (Weights &w,
		       mg_u_long levelDocNum,
		       mg_u_long termFreq,
		       float idf) {
  const double tfIdf = termFreq * idf;
  float &slot = w[levelDocNum-1];
  slot += tfIdf * tfIdf;
}

// Computes, for one document level, the sum of squared term weights of
// every level document.
//
// Every word in the dictionary is visited in order.  Its postings are
// decoded from the inverted file: the fragment number is accumulated from
// bblock-coded values and, unless the index is word-level, a gamma-coded
// count follows each one.  Fragments are mapped to level documents via
// fragLevelConvert, the counts falling into the same level document are
// summed into a term frequency, and (termFreq * idf)^2 is added to that
// document's slot in w, where idf = log(numLevelDocs) - log(level freq).
//
//   idh, ifh         - dictionary and inverted file headers
//   numLevelDocs     - number of documents at this level; w gets this size
//   levelNum         - index of the level in each word's levelFreqs
//   dictFile, invfFile, invfIdxFile - input files; all are repositioned
//   fragLevelConvert - fragment -> level document mapping for this level
//   w                - cleared and refilled with the squared weights
//
// Any inconsistency in the input terminates the program.
static void GenerateLevelWeights (const invf_dict_header &idh,
				  const invf_file_header &ifh,
				  mg_u_long numLevelDocs,
				  mg_u_long levelNum,
				  FILE *dictFile,
				  FILE *invfFile,
				  FILE *invfIdxFile,
				  const FragLevelConvert &fragLevelConvert,
				  Weights &w) {
  // pre-allocate the right number of weights
  w.erase (w.begin(), w.end());
  w.insert (w.end(), (Weights::size_type)numLevelDocs, (float)0.0);

  double logN = log ((double) numLevelDocs);

  // reset the files
  fseek (dictFile, idh.word_dict_start, SEEK_SET);
  fseek (invfIdxFile, sizeof (mg_u_long), SEEK_SET);
  
  // process each word adding its contributions to the document weights
  mg_u_long wordNum;
  mg_u_long wordStart;
  word_dict_el wordEl;
  wordEl.SetNumLevels (idh.num_levels);
  for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
#ifndef SILENT
    // give a little feedback every 4096 words
    if ((wordNum & 0xfff) == 0) fprintf (stderr, ".");
#endif
    
    wordEl.Read (dictFile, idh.num_levels);
    ReadUL (invfIdxFile, wordStart);

    float idf = logN - log ((double) wordEl.levelFreqs[levelNum]);

    // seek to the appropriate place in the inverted file
    fseek (invfFile, wordStart, SEEK_SET);
    stdio_bitio_buffer buffer (invfFile);
    
    mg_u_long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
    mg_u_long fragNum = 0;
    mg_u_long levelDocNum = 0;
    mg_u_long lastLevelDocNum = 0;
    mg_u_long termFreq = 0;
    mg_u_long checkLevelFreq = 0;
    
    mg_u_long count, i;
    for (i=0; i<wordEl.frag_occur; ++i) {
      fragNum += buffer.bblock_decode (B, NULL);
      if (!ifh.word_level_index) count = buffer.gamma_decode (NULL);
      else count = 1;

      // the values are unsigned and possibly wider than int, so they are
      // passed as unsigned long to match the %lu conversions
      if (fragNum > idh.num_frags)
	FatalError (1, "fragNum = %lu, "
		    "number of fragments = %lu\n"
		    "wordNum = %lu\n"
		    "i = %lu, frag_occur = %lu\n",
		    (unsigned long) fragNum, (unsigned long) idh.num_frags,
		    (unsigned long) wordNum, (unsigned long) i,
		    (unsigned long) wordEl.frag_occur);

      if (!fragLevelConvert.FragToLevel (fragNum, levelDocNum))
	FatalError (1, "could not convert fragment number %lu in level %lu",
		    (unsigned long) fragNum, (unsigned long) levelNum);

      if (levelDocNum == 0 || levelDocNum > numLevelDocs)
	FatalError (1, "bad level document number %lu in level %lu",
		    (unsigned long) levelDocNum, (unsigned long) levelNum);
      
      if (levelDocNum != lastLevelDocNum) {
	// new level document: flush what was gathered for the previous one
	if (lastLevelDocNum > 0) {
	  AddWeight (w, lastLevelDocNum, termFreq, idf);
	  ++checkLevelFreq;
	}
	lastLevelDocNum = levelDocNum;
	termFreq = 0;
      }
      termFreq += count;
    }

    // flush the final level document of this word
    if (lastLevelDocNum > 0) {
      AddWeight (w, lastLevelDocNum, termFreq, idf);
      ++checkLevelFreq;
    }

    // the number of level documents seen must agree with the dictionary
    if (checkLevelFreq != wordEl.levelFreqs[levelNum]) {
      cerr << "bad level freq at level " <<levelNum<<" "<< checkLevelFreq << " != "
	   << wordEl.levelFreqs[levelNum] << ", word \""
	   << wordEl.el << "\" (" << wordNum << ")\n";
      exit (1);
    }
    
    buffer.done();
  }

  if (w.size() != numLevelDocs) 
    FatalError (1, "wrong number of weights created %lu != %lu",
		(unsigned long) w.size(), (unsigned long) numLevelDocs);
}

static void WriteExactWeights (FILE *weightsFile,
			       mg_u_long &diskPtr,
			       const Weights &w) {
  diskPtr = ftell(weightsFile);

  Weights::const_iterator here = w.begin();
  Weights::const_iterator end = w.end();
  while (here != end) {
//      cout << *here << "\n";
    WriteF (weightsFile, sqrt (*here));
    ++here;
  }
}

// Appends a quantised version of the weights to approxWeightsFile.
//
// The square root of each accumulator in w is the document weight.  With
// L the smallest non-zero weight, U the largest, and
// B = (U/L)^(2^-bits), each weight is stored as the integer
// floor(log(weight/L) / log(B)) clamped to the largest value that fits
// in `bits` bits.  The output is: bits (one byte), L, B, then the codes
// packed low bits first into mg_u_long words.
//
//   approxWeightsFile - output file, written at its current position
//   diskPtr           - receives the position at which this data starts
//   w                 - sums of squared term weights per level document
//   bits              - number of bits per stored code (0-32)
static void WriteApproxWeights (FILE *approxWeightsFile,
				mg_u_long &diskPtr,
				const Weights &w,
				unsigned char bits) {
  diskPtr = ftell(approxWeightsFile);

  // calculate L, U and B
  double L = 1e300;
  double U = 0;
  float wgt;
  Weights::const_iterator here = w.begin();
  Weights::const_iterator end = w.end();
  while (here != end) {
    wgt = sqrt (*here);
    if (wgt > U) U = wgt;
    if (wgt > 0 && wgt < L) L = wgt;
    ++here;
  }

  double B = pow (U / L, pow (2.0, -(double) bits));

#ifndef SILENT
  fprintf (stderr, "L = %f\n", L);
  fprintf (stderr, "U = %f\n", U);
  fprintf (stderr, "B = %f\n", B);
#endif
  
  WriteUC (approxWeightsFile, bits);
  WriteD (approxWeightsFile, L);
  WriteD (approxWeightsFile, B);
  

  // largest code representable in `bits` bits; the shift is done in
  // mg_u_long so that bits == 31 does not overflow a signed int
  mg_u_long max = (bits == 32) ? 0xffffffff : (((mg_u_long) 1) << bits) - 1;
  // log (B) does not change between documents
  const double logB = log (B);
  mg_u_long i=0, buf=0, pos=0;
  here = w.begin();
  end = w.end();
  while (here != end) {
    mg_u_long fx;
    wgt = sqrt (*here);
    if (wgt == 0) {
      wgt = L;
#ifndef SILENT
      Message ("Warning: Document %lu had a weight of 0.", (unsigned long) i);
#endif
    }

    // clamp while still in floating point: converting a NaN (log (B) is
    // zero when all weights are equal), a negative value or a value
    // beyond the integer range to an unsigned type is undefined
    double level = floor (log (wgt / L) / logB);
    if (!(level > 0)) fx = 0;
    else if (level >= (double) max) fx = max;
    else fx = (mg_u_long) level;

    buf |= (fx << pos);
    pos += bits;

    if (pos >= MAXBITS) {
      WriteUL (approxWeightsFile, buf);
      // pos becomes the number of high bits of fx that did not fit into
      // the word just written; they start the next word.  When nothing
      // spilled over the next word starts empty (shifting fx by the full
      // width of the type would be undefined).
      pos = pos - MAXBITS;
      buf = (pos > 0) ? (fx >> (bits - pos)) : 0;
    }

    ++here; ++i;
  }

  // write out the last bits
  if (pos > 0) WriteUL (approxWeightsFile, buf);
}

int main (int argc, char **argv) {
  unsigned char bits = 8;
  char *filename = (char*)"";
  int ch;
  opterr = 0;
  msg_prefix = argv[0];

  while ((ch = getopt (argc, argv, "f:d:b:h")) != -1) {
    switch (ch) {
    case 'f':		// input file
      filename = optarg;
      break;
    case 'd':
      set_basepath (optarg);
      break;
    case 'b':
      bits = atoi (optarg);
      if (bits > 32) {
	fprintf (stderr, "b may only take values 0-32\n");
	exit (1);
      }
      break;
    case 'h':
    case '?':
      fprintf (stderr, "usage: %s [-f input_file]"
	       "[-d data directory] [-b bits] [-h]\n", argv[0]);
      exit (1);
    }
  }

  
  // open the dictionary
  FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
			      MAGIC_STEM_BUILD, MG_ABORT);
  invf_dict_header idh;
  idh.Read (dictFile);

  // open the inverted file
  FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
			MAGIC_INVF, MG_ABORT);
  invf_file_header ifh;
  ifh.Read (invfFile);
  if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
    FatalError (1, "The invf file contains skips. Unable to create weights.");

  // open the inverted index file
  FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
				 MAGIC_INVI, MG_ABORT);
  
  // read the level information
  FILE *levelFile = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
			       MAGIC_INVF_LEVELS, MG_ABORT);
  FIvfLevel ivfLevel;
  ivfLevel.Read (levelFile);
  fclose (levelFile);

  // read in the tag dictionary and inverted file pointers
  WBTagDict tagDict;
  ReadTagDict (idh, dictFile, invfIdxFile, tagDict);

  
  // create the weights file
  FILE *weightsFile = create_file (filename, WEIGHTS_SUFFIX, "wb",
				   MAGIC_WGHT, MG_ABORT);

  // create the approx weights file
  FILE *approxWeightsFile = create_file (filename, APPROX_WEIGHTS_SUFFIX, "wb",
					 MAGIC_WGHT_APPROX, MG_ABORT);

  
  // create weights for each document level
  FragLevelConvert fragLevelConvert;
  Weights w;
  IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
  IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
  mg_u_long levelNum = 0;
  while (levelHere != levelEnd) {
    const UCArray &levelName = (*levelHere).first;
    
    // read the tag information about this level
    fragLevelConvert.Read(invfFile, tagDict[levelName].tagPtr,
			  idh.num_frags, tagDict[levelName].fragOccur);
    
    // create the weights for this level
    GenerateLevelWeights (idh, ifh, tagDict[levelName].fragOccur,
			  levelNum, dictFile, invfFile, invfIdxFile,
			  fragLevelConvert, w);

    // write out the exact weights
    WriteExactWeights (weightsFile,
		       (*levelHere).second.exactWeightsDiskPtr,
		       w);
    
    // write out the approximate weights
    WriteApproxWeights (approxWeightsFile,
			(*levelHere).second.approxWeightsDiskPtr,
			w, bits);
    
    ++levelHere; ++levelNum;
  }
  

  // close input files
  fclose (dictFile);
  fclose (invfFile);
  fclose (invfIdxFile);

  // update the level information
  levelFile = create_file (filename, INVF_LEVEL_SUFFIX, "wb",
			   MAGIC_INVF_LEVELS, MG_ABORT);
  ivfLevel.Write (levelFile);
  fclose (levelFile);

  // close output files
  fclose (weightsFile);
  fclose (approxWeightsFile);


  return 0;
}
