#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib2
import urllib
import tempfile
import os
import os.path
import time
import xml.dom.minidom
from subprocess import Popen, PIPE
import sys

# Folder holding the Tabulator JavaScript sources, and the files (relative to
# that folder) that are handed to the JavaScript shell.
TABULATOR_KB_FOLDER = "/afs/csail.mit.edu/group/dig/www/data/2005/ajar/ajaw/js/"
TABULATOR_KB_FILES = ["uri.js", "rdf/term.js", "rdf/match.js", "rdf/identity.js", "rdf/serialize.js"]
# Plain string concatenation is intentional: TABULATOR_KB_FOLDER ends in "/".
# A list comprehension (rather than map) guarantees a real list, so it can be
# concatenated with other lists regardless of the interpreter version.
TABULATOR_FILES_ABSOLUTE = [TABULATOR_KB_FOLDER + _kb_file
                            for _kb_file in TABULATOR_KB_FILES]
CWM_EXECUTABLE = "/afs/csail.mit.edu/group/dig/www/data/TAMI/2007/cwmrete/cwm"
# We should not need jQuery once serialize.js can do XML-tag for rdf:type.
JQUERY_FILE = "jquery-1.3.2-mod.min.js"

# __folder__ is the directory in which init_env.js and the helper scripts are
# looked up: the working directory when run as a script, otherwise the
# directory containing this module.
if __name__ == '__main__':
    __folder__ = os.getcwd()
else:
    # abspath() first: when the module is imported through a relative path,
    # dirname(__file__) alone can be '' and a later os.chdir('') would fail.
    __folder__ = os.path.dirname(os.path.abspath(__file__))

def _as_file_uri(source):
    """Return *source* as a URI, mapping scheme-less local paths to file://.

    Anything containing ':' is taken to be a URI already and is returned
    unchanged. Other values are treated as local paths; relative ones are
    resolved against the current working directory.
    """
    if ":" in source:
        return source
    if source.startswith('/'):
        return "file://" + source
    return "file://" + os.getcwd() + "/" + source


def _parse_document(document_string):
    """Parse *document_string* into a DOM, falling back to html5lib.

    The document is parsed from an in-memory string (not a stream) so that
    the fallback parser sees the complete text after the XML parser failed.
    """
    from xml.parsers.expat import ExpatError
    try:
        return xml.dom.minidom.parseString(document_string)
    except ExpatError:
        # Not well-formed XML: parse it as HTML into a minidom-style tree.
        import html5lib
        from html5lib import treebuilders
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        return parser.parse(document_string)


def _js_file_arguments(setup_script_path, scrape_script_path, enableJQ):
    """Build the ``-f <file>`` argument list for the JavaScript shell."""
    script_files = (["nullify_tab.js"] + list(TABULATOR_FILES_ABSOLUTE) +
                    ["E4X2DOM.js", setup_script_path])
    if enableJQ:
        script_files.append(JQUERY_FILE)
    # term_env.js comes last: it serializes the store.
    script_files.extend([scrape_script_path, "term_env.js"])

    arguments = []
    for script_file_name in script_files:
        arguments.extend(["-f", script_file_name])
    return arguments


def run_jsGRDDL(source, script, format="XML", mirror="", location=None,
                enableJQ=False):
    """Run a JavaScript scraping script against a document and return its output.

    The document at *source* is fetched and parsed into a DOM, and its
    serialization is substituted into the ``init_env.js`` template found in
    ``__folder__``. The resulting setup script, the Tabulator files, *script*
    and ``term_env.js`` are then executed by the ``./js`` shell in
    ``__folder__``.

    Parameters:
        source   -- URI of the document, or a local path (any value without
                    a ':' is treated as a local path).
        script   -- URL or path of the JavaScript to run; it is fetched with
                    urllib.urlretrieve.
        format   -- substituted into the template; when it equals "XML" the
                    shell output is additionally piped through
                    ``cwm -rdf=d``.
        mirror   -- substituted into the template.
        location -- if given, replaces *source* as the document URI written
                    into the template (for a local copy of a document that
                    does have an HTTP URI).
        enableJQ -- when true, JQUERY_FILE is loaded before *script*.

    Returns the standard output of cwm (format "XML") or of the JavaScript
    shell (any other format) as a string.

    Raises:
        ValueError -- if *script* is empty or None.
        OSError    -- if the JavaScript shell wrote anything to stderr.
    """
    if not script:
        raise ValueError("run_jsGRDDL requires a script to execute")

    source = _as_file_uri(source)
    document = urllib2.urlopen(source)
    try:
        document_string = document.read()
    finally:
        document.close()
    dom = _parse_document(document_string)

    if location:
        source = location

    # repr() yields a quoted, backslash-escaped literal for the template.
    # Drop the u'' prefix only when it is actually there, so a byte-string
    # result does not lose its opening quote.
    c_escaped_xmlstring = repr(dom.documentElement.toxml())
    if c_escaped_xmlstring.startswith('u'):
        c_escaped_xmlstring = c_escaped_xmlstring[1:]

    # init_env.js is a %-template taking source, mirror, document and format.
    template_file = open(os.path.join(__folder__, 'init_env.js'))
    try:
        template = template_file.read()
    finally:
        template_file.close()
    setup_script = template % (repr(source), repr(mirror),
                               c_escaped_xmlstring, repr(format))

    setup_js_tempfile = tempfile.NamedTemporaryFile(mode='w', bufsize=0)
    # Retrieve the script into a temporary file of our own so that it is
    # removed afterwards, instead of leaving urlretrieve's file behind.
    scrape_js_tempfile = tempfile.NamedTemporaryFile(suffix='.js')
    # The shell's stderr goes to a file rather than a pipe: draining one pipe
    # to EOF while another fills up can block both processes forever.
    js_stderr = tempfile.TemporaryFile()
    old_dir = os.getcwd()
    try:
        setup_js_tempfile.write(setup_script.encode('utf-8'))
        urllib.urlretrieve(script, scrape_js_tempfile.name)

        # Resolve the temp paths before changing directory.
        script_para = _js_file_arguments(
            os.path.abspath(setup_js_tempfile.name),
            os.path.abspath(scrape_js_tempfile.name),
            enableJQ)

        #@@restrict to 5s of execution time
        os.chdir(__folder__)  # the helper scripts are named relative to it
        js_shell = Popen(["./js", "-x"] + script_para,
                         stdout=PIPE, stderr=js_stderr)
        if format == "XML":
            try:
                cwm = Popen([CWM_EXECUTABLE, "-rdf=d"],
                            stdin=js_shell.stdout, stdout=PIPE)
            finally:
                # cwm (if started) owns the read end now; closing our copy
                # lets the shell notice when its reader goes away.
                js_shell.stdout.close()
            output = cwm.communicate()[0]
            js_shell.wait()
        else:
            output = js_shell.communicate()[0]

        js_stderr.seek(0)
        js_error = js_stderr.read()
        if js_error:
            raise OSError("Error occured during JavaScript execution. SpiderMonkey returned the following message:\n" + js_error)
        return output
    finally:
        # Always undo the chdir and remove the temporary files, including
        # when the JavaScript run failed.
        os.chdir(old_dir)
        js_stderr.close()
        scrape_js_tempfile.close()
        setup_js_tempfile.close()

if __name__ == '__main__':
    # CGI entry point. Query parameters:
    #   source   (required) document to process
    #   script   (required) JavaScript to run against it
    #   enableJQ (optional) presence enables jQuery in the js environment
    #   mirror   (optional) a site that mirrors the source:
    #            <source> cc:derivativeWork <mirror>.
    # The result is returned as N3.
    import cgi
    import cgitb; cgitb.enable()
    import sys

    fields = cgi.FieldStorage()
    missing = [name for name in ('source', 'script') if name not in fields]
    if missing:
        # Reject incomplete requests up front instead of passing None on to
        # run_jsGRDDL and failing somewhere inside it.
        sys.stdout.write("Status: 400 Bad Request\n")
        sys.stdout.write("Content-Type: text/plain\n")
        sys.stdout.write("\n")
        sys.stdout.write("Missing required parameter(s): %s\n"
                         % ", ".join(missing))
    else:
        source = fields['source'].value
        script = fields['script'].value
        enableJQ = 'enableJQ' in fields
        mirror = ""
        if 'mirror' in fields:
            mirror = fields['mirror'].value
        # NOTE: a redirected request does not carry a
        # "referer: <redirecting resource>" header, so no referer is read.
        try:
            result = run_jsGRDDL(source, script,
                                 format="N3", enableJQ=enableJQ, mirror=mirror)
        except Exception:
            sys.stdout.write("Status: 500 Internal Server Error\n")
            sys.stdout.write("Content-Type: text/html\n")
            sys.stdout.write("\n")
            raise #Let cgitb display colorful error message
        # Written outside the try so the 500 headers can never follow
        # headers that were already sent.
        sys.stdout.write("Content-Type: text/rdf+n3\n")
        sys.stdout.write("\n")
        sys.stdout.write(result + "\n")