# -*- coding: latin-1 -*-

from glob import glob
from collections import defaultdict
import codecs
import sys
import re
import os
import datetime

import htmlentitydefs

from nltk.tokenize import wordpunct_tokenize
from nltk.util import clean_html

# Patterns used to clean up the free-text date and rating fields.
# Anything enclosed in parentheses, parentheses included.
parenthetical = re.compile(r"\([^\)]*\)")
# Everything up to and including the last "am" or "publiziert" on a line.
date_lead = re.compile(r".*(am|publiziert)")
# Separators of a purely numeric date: "." or "/".
date_dividers = re.compile(r"[\./]")
# Any run of characters that are not ASCII digits.
not_numbers = re.compile(r"[^0-9]*")

# Maps the month token found in a review date to its month number.
months = {
    "Januar": 1,
    "Februar": 2,
    # NOTE(review): "Mrz" looks like "März" whose umlaut was lost in a
    # re-encoding of this file; kept as-is so existing lookups still work.
    "Mrz": 3,
    u'M\xe4rz': 3,
    "April": 4,
    # A two-month span maps to its first month.
    "April/Mai": 4,
    "Mai": 5,
    "Juni": 6,
    "Juli": 7,
    "August": 8,
    "September": 9,
    "Oktober": 10,
    "November": 11,
    "Dezember": 12,
}

# Numeric months are accepted bare ("3"), zero-padded ("03") and with a
# trailing dot ("3.", "03.").  0 stays in the table for backward
# compatibility even though it is not a valid month number.
for ii in range(13):
    for key in ("%i" % ii, "%02i" % ii):
        months[key] = ii
        months[key + "."] = ii

# Number of reviews already seen for each timestamp.
date_count = defaultdict(int)

# Date of first motion picture
reference_date = datetime.datetime(1878, 6, 19)

def german_unicode_to_ascii(german):
    """
    Transliterates a German unicode string to plain ASCII.

    Umlauts and eszett are spelled out (ue, oe, ae, ss and the capitalised
    Ue, Oe, Ae); every other non-ASCII character is dropped.  Returns the
    result of encoding to ASCII, i.e. a byte string.
    """
    # Escapes instead of literal characters: the table no longer depends on
    # the encoding this source file happens to be saved in.
    replacements = (
        (u'\xfc', u'ue'),
        (u'\xf6', u'oe'),
        (u'\xe4', u'ae'),
        (u'\xdf', u'ss'),
        (u'\xdc', u'Ue'),
        (u'\xd6', u'Oe'),
        (u'\xc4', u'Ae'),
    )
    for special, spelled_out in replacements:
        german = german.replace(special, spelled_out)
    return german.encode("ascii", "ignore")

# Python 2 spells these `unichr` / `htmlentitydefs`, Python 3 `chr` /
# `html.entities`.  Resolve them once so the function runs on either.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

try:
    from htmlentitydefs import name2codepoint as _name2codepoint
except ImportError:
    from html.entities import name2codepoint as _name2codepoint

# Decimal reference, hexadecimal reference, or named entity.
_entity = re.compile(r"&(?:#([0-9]+)|#[xX]([0-9A-Fa-f]+)|([A-Za-z0-9_]+));")

def _decode_entity(match):
    """Returns the character for one entity match, or the entity text unchanged if it is unknown."""
    decimal, hexadecimal, name = match.groups()
    try:
        if decimal is not None:
            return _unichr(int(decimal))
        if hexadecimal is not None:
            return _unichr(int(hexadecimal, 16))
        return _unichr(_name2codepoint[name])
    except (KeyError, ValueError, OverflowError):
        # Unknown name or code point outside the supported range.
        return match.group(0)

def convert_html_entities(s):
    """
    Replaces HTML character references in s with the characters they denote.

    Handles decimal (&#228;), hexadecimal (&#xE4;) and named (&auml;)
    references.  Unknown names and out-of-range code points are left
    untouched.  The string is decoded in a single pass, so text produced by
    one reference is never decoded a second time ("&amp;lt;" and "&#38;lt;"
    both yield "&lt;", not "<").
    """
    return _entity.sub(_decode_entity, s)

# Maps each review field to the (start, stop) pair of HTML comments that
# encloses it.  The marker text is matched literally, so the irregular runs
# of dashes and spaces below are significant.
splits = {
    "intro": (
        "<!-- ----------BEGIN INTRO -------------- -->",
        "<!-- ----------END INTRO ----------------- -->",
    ),
    "film": (
        "<!-- -----------BEGIN FILMTITEL ---------- -->",
        "<!-- -----------END FILMTITEL ------------- -->",
    ),
    "date": (
        "<!-- --------BEGIN DATUM--------------- -->",
        "<!-- --------END DATUM-----------------  -->",
    ),
    "review": (
        "<!-- ---------BEGIN REZENSION---------- -->",
        "<!-- ---------END REZENSION --------------->",
    ),
    "author": (
        "<!-- ---------BEGIN AUTORENNAME ------- -->",
        "<!-- ---------END AUTORENNAME---------- -->",
    ),
    "rating": (
        "<!-- ----------BEGIN WERTUNG-------------- -->",
        "<!-- ----------END WERTUNG ------------------>",
    ),
}

class Review(object):
  """
  One review page, split into the fields named in `splits`.

  Each field holds the text between its start and stop marker after HTML
  clean-up and entity decoding; fields are read with review["name"].
  """

  def __init__(self, filename, text):
    """
    Extracts every field listed in `splits` from text.

    filename is only kept for diagnostics.  Fails with an AssertionError if
    any start or stop marker is missing from text.
    """
    self.fields = {}
    self.filename = filename

    for name in splits:
      start, stop = splits[name]

      assert start in text, "%s: no start marker for %r" % (filename, name)
      assert stop in text, "%s: no stop marker for %r" % (filename, name)

      # Text after the first start marker, up to the first stop marker that follows.
      raw = text.split(start, 1)[1].split(stop, 1)[0]
      self.fields[name] = convert_html_entities(clean_html(raw))

  def __getitem__(self, item):
    """Returns the cleaned text of the field called item."""
    return self.fields[item]

  def time(self):
    """
    Returns number of hours between midnight of the day Muybridge created
    the first motion picture (a reasonable lower bound for a film review
    date) and midnight of the review date.

    Accepts "day month year" and "month year" separated by whitespace, or a
    purely numeric date separated by "." or "/".  Raises ValueError if the
    date has any other shape.
    """
    date_string = self["date"]
    date_string = parenthetical.sub("", date_string)
    date_string = date_lead.sub("", date_string).strip()

    print(date_string)
    # split() recognises every kind of whitespace, so dates separated by a
    # tab or a non-breaking space are handled like space-separated ones.
    date_fields = date_string.split()
    if len(date_fields) > 1:
      print(date_fields)
      print(self.filename)

      if len(date_fields) == 2:
        month, year = date_fields
        # No day given: use the middle of the month.
        day = 15
      elif len(date_fields) == 3:
        day, month, year = date_fields
        day = int(day[:-1] if day.endswith(".") else day)
      else:
        raise ValueError("%s: cannot parse date %r" % (self.filename, date_string))

      month = months[month]
      year = int(year)
    else:
      print(self.filename)
      day, month, year = map(int, date_dividers.split(date_string))

    # Two-digit years: 00-29 are read as 20xx, 30-99 as 19xx.
    if year < 30:
      year += 2000
    elif year < 100:
      year += 1900

    hours = (datetime.datetime(year, month, day) - reference_date).days * 24
    assert hours > 0, "%i %i %i" % (year, month, day)
    return hours

  def words(self):
    """Yields the tokens of the intro followed by the tokens of the review body."""
    for field in ("intro", "review"):
      for token in wordpunct_tokenize(self[field]):
        yield token

  def author(self):
    """
    Returns the ASCII author name with its parts joined by "+".
    Parts of a single character are dropped.
    """
    parts = german_unicode_to_ascii(self["author"]).split()
    return "+".join(x for x in parts if len(x) > 1)

  def text(self):
    """Returns all tokens of the review joined by single spaces."""
    return " ".join(self.words())

  def rating(self):
    """
    Returns the rating as a fraction.

    The field is read as "<score> von <maximum>".  A maximum of 5 gives
    score / maximum; a maximum of 4 is mapped into 0.1 .. 0.9.  Any other
    maximum fails an assertion.
    """
    raw_rating = self["rating"].lower()
    # NOTE(review): not_numbers removes every non-digit, so a fractional
    # score such as "3,5" would be read as 35 - confirm the data has none.
    num, den = [float(not_numbers.sub("", x)) for x in raw_rating.split("von")]
    assert num <= den, "%s: rating %r exceeds its maximum" % (self.filename, raw_rating)
    if den == 5.0:
      return num / den
    else:
      # Pang Lee convention for converting scores
      assert den == 4.0, "%s: unsupported rating scale %r" % (self.filename, raw_rating)
      return 0.1 + (num / den) * 0.8

  def print_fields(self):
    """Prints every field with non-ASCII characters removed."""
    for ii in splits:
      print("%s:%s\n----------\n" % (ii, self[ii].encode("ascii", 'ignore')))

def write_pang_lee_format(reviews, dir):
  """
  Writes one sub-directory of dir per key of reviews.

  reviews maps a name to an iterable of Review objects.  For each name three
  UTF-8 files are written line by line, one line per review: subj.<name>
  (the tokenised text), rating.<name> (the rating) and id.<name> (the
  review's timestamp plus the number of reviews already written with that
  same timestamp, taken from the module-level date_count).

  Raises OSError if a sub-directory can neither be created nor already
  exists.
  """
  for author in reviews:
    print("|%s|" % author)
    author_dir = "%s/%s" % (dir, author)
    try:
      os.mkdir(author_dir)
    except OSError:
      # Only "already exists" is harmless; any other failure would resurface
      # later as a less helpful error when the files are opened.
      if not os.path.isdir(author_dir):
        raise

    # The with block closes all three files even if a review fails to parse.
    with codecs.open("%s/subj.%s" % (author_dir, author), 'w', 'utf-8') as text_out, \
         codecs.open("%s/rating.%s" % (author_dir, author), 'w', 'utf-8') as rating_out, \
         codecs.open("%s/id.%s" % (author_dir, author), 'w', 'utf-8') as id_out:
      for review in reviews[author]:
        stamp = review.time()
        text_out.write("%s\n" % review.text())
        rating_out.write("%f\n" % review.rating())
        # Offset by the number of earlier reviews sharing this timestamp.
        id_out.write("%i\n" % (stamp + date_count[stamp]))
        date_count[stamp] += 1


# Usage: <script> '<glob pattern of review pages>' <output directory>
if __name__ == "__main__":
  if len(sys.argv) < 3:
    sys.exit("usage: %s '<glob pattern of review pages>' <output directory>" % sys.argv[0])

  # Reviews grouped by author name.
  reviews = defaultdict(set)
  for ii in glob(sys.argv[1]):
    print("Parsing %s" % ii)
    with codecs.open(ii, 'r', 'latin-1') as infile:
      text = infile.read()
    # Pages without a rating section are skipped.
    if "-BEGIN WERTUNG-" in text:
      r = Review(ii, text)
      reviews[r.author()].add(r)
  for ii in reviews:
    print("_____________________ %s ___________________" % ii.encode("ascii", 'ignore'))
    for jj in reviews[ii]:
      jj.print_fields()
      print("*****  %f  ****" % jj.rating())


  write_pang_lee_format(reviews, sys.argv[2])
