#!/usr/bin/python
# Joe Presbrey <presbrey@mit.edu>
#
# Dependencies:
# * rdfxml: <http://infomesh.net/2003/rdfparser/>
import rdfxml

import sys
from time import sleep
from threading import Thread, RLock

import urllib2, htmllib
from HTMLParser import HTMLParser


### RDF/XML macros

isBlank = lambda x: isinstance(x, basestring) and len(x)>2 and x[0:2] == u'_:'
isSymbol = lambda x: isinstance(x, basestring) and len(x)>1 and x[0] == u'<' and x[-1] == u'>'
Symbol = lambda x: u'<' + unicode(x) + u'>'
Statement = lambda s,p,o,w: {
    'subj': unicode(s), 'pred': unicode(p),
    'obj': unicode(o), 'why': isSymbol(w) and unicode(w) or Symbol(w) }
Namespace = lambda uri: lambda x: Symbol(uri + x)

foaf = Namespace(u'http://xmlns.com/foaf/0.1/')
schema = Namespace(u'http://www.w3.org/2000/01/rdf-schema#')

unwrap = lambda x: isSymbol(x) and x[1:-1] or x
unfrag = lambda x: '#' in x and (x[:x.find('#')==-1 and len(x) or x.find('#')] + (x[0]=='<' and '>' or '')) or x
frag = lambda x: x[x.find('#')==-1 and len(x) or x.find('#'):len(x)-(x[-1]=='>')]
cpfrag = lambda x,y: unfrag(y)[-1] == '>' and unfrag(y)[:-1]+frag(x)+'>' or unfrag(y)+frag(x)


### rdfxml helper

def parseURI(uri, sink):
    uri = unwrap(unfrag(uri))
    req = urllib2.Request(uri, None, {'Accept': 'application/rdf+xml'})
    r = urllib2.urlopen(req)
    sink.set_why(r.geturl())
    if hasattr(rdfxml, 'parse'):
        rdfxml.parse(r.geturl(), r, callback=sink)
    else:
        d = r.read()
        rdfxml.parseRDF(d, base=r.geturl(), sink=sink)
    return r.geturl()


### supporting classes

class Store(object):
    "basic triple store"

    def __init__(self):
        self._statements = []

    def __iter__(self):
        for x in self._statements: yield x

    def __str__(self):
        return '\n'.join(map(str, self._statements))

    def add(self, s, p, o, w):
        w = not isSymbol(w) and Symbol(unfrag(w)) or unfrag(w)
        self._statements.append(Statement(s, p, o, w))

    def addStatement(self, statement):
        if not statement is None:
            self._statements.append(statement)

    def statementsMatching(self, s=None, p=None, o=None, w=None):
        return filter(lambda x: (s is None or x['subj'].upper() == s.upper()) and \
                                (p is None or x['pred'].upper() == p.upper()) and \
                                (o is None or x['obj'].upper() == o.upper()) and \
                                (w is None or x['why'].upper() == w.upper()), self._statements)

    def the(self, s=None, p=None, o=None, w=None):
        r = self.statementsMatching(s, p, o, w)
        if len(r) == 1: return r[0]
        return Statement(None,None,None,None)

    class _Sink(object):
        def __init__(self, store, why): (self._store, self._why) = (store, why)
        def set_why(self, why): self._why = why
        def triple(self, s, p, o): self._store.add(s, p, o, self._why)
        def __call__(self, s, p, o): self.triple(s, p, o)

    def sink(self, why=None): return self._Sink(self, why)

class TagParser(HTMLParser):
    def __init__(self, data, constraint=None):
        HTMLParser.__init__(self)
        self._constraint = constraint
        self._entities = {}
        self.feed(data)
        self.close()
    def handle_startendtag(self, tag, attrs):
        attrs = dict(attrs)
        if not callable(self._constraint) \
           or self._constraint(tag, attrs):
            if not tag in self._entities:
                self._entities[tag] = []
            self._entities[tag].append(attrs)
    def __getitem__(self, k):
        return self._entities.get(k)


### the Voyager

# TagParser constraint
_filter_link_openid_delgate = lambda tag, attr: tag == 'link' \
        and not attr.get('href') is None \
        and ((attr.get('rel') == 'openid.delegate' or attr.get('rel') == 'openid.server'))

class Walk(object):
    _lock_parse = RLock()

    def __init__(self, home='http://dig.csail.mit.edu/data#DIG'):
        self._home = isSymbol(home) and home or Symbol(home)
        self._store = Store()
        (self._walk, self._uri_map, self._walked) = ([], {}, {})
        self._walk_i = 0

    def _preload(self):
        "load the base tree"
        uri1 = cpfrag(self._home, parseURI(self._home, self._store.sink()))
        self.uri_map(self._home, uri1)
        self.set_walked(uri1)
        self._walk.append([x['obj'] for x in self._store.statementsMatching(s=Symbol(uri1), p=foaf('member'), w=Symbol(unfrag(uri1)))])

    def uri_map(self, src, dst):
        "map rewritten URIs"
        src = unwrap(unfrag(src))
        dst = unwrap(unfrag(dst))
        if len(src) > 0 and len(dst) > 0 and src != dst:
            self._uri_map[src] = dst

    def uri_resolve(self, uri):
        "resolve rewritten URIs"
        uri1 = unwrap(unfrag(uri))
        if uri1 in self._uri_map:
            if isSymbol(uri):
                return Symbol(cpfrag(uri, self._uri_map[uri1]))
            else:
                return cpfrag(uri, self._uri_map[uri1])
        return uri

    def set_walked(self, uri):
        "mark a uri as walked"
        uri = unwrap(unfrag(uri))
        self._walked[uri] = 1

    def is_walked(self, uri):
        "check if a uri has already been walked"
        uri = self.uri_resolve(unwrap(unfrag(uri)))
        return uri in self._walked and self._walked[uri] == 1

    def _this_iter(self):
        return self._walk_i

    def _this_walk(self):
        return self._walk[self._walk_i]

    def _next_walk(self):
        w = self._this_walk()
        self._walk.append([])
        self._walk_i = len(self._walk)-1
        return w

    def _queue_walk(self, uri):
        "queue given uris for walking on next iteration"
        if type(uri) is list:
            for x in uri:
                self._queue_walk(x)
        elif not self.is_walked(uri):
            if self._walk_i == 0 or \
              (not uri in self._walk[self._walk_i-1]
              and not uri in self._walk[self._walk_i]):
                #sys.stderr.write("[%d] queueing %s\n" % (self._walk_i, uri.encode('ascii','ignore')))
                self._walk[self._walk_i].append(uri)

    def _run(self):
        "run a single iteration"
        c = 0
        children = self._next_walk()
        threads = []

        for subj in children:
            "spawn a thread for each child URI at this iteration"
            c += 1
            t = Thread(target=self._parse, kwargs={'subj':subj})
            threads.append(t)
            t.start()
            #if c % 8 == 0:
            #    sleep(.5)

        while 1:
            "wait for all child threads to finish"
            threads = filter(lambda x: x.isAlive(), threads)
            if len(threads) == 0:
                break
            threads[0].join()

    def _parse(self, subj):
        next2 = []
        store = Store()
        #sys.stderr.write("parsing %s\n" % subj.encode('ascii','ignore'))
        try:
            "copy fragment from old URI to actual URI"
            uri1 = cpfrag(subj, parseURI(subj, store.sink()))
        except:
            self.set_walked(subj)
            return

        self._lock_parse.acquire()
        self.uri_map(subj, uri1)
        self.set_walked(uri1)
        subj1 = Symbol(uri1)

        if len(store.statementsMatching(subj1, foaf('name'))) == 0:
            "catch FOAF files with invalid about/primaryTopic"
            subj1 = None

        openid = [x['obj'] for x in store.statementsMatching(subj1, p=foaf('openid'))]
        name = store.the(subj1, foaf('name'), None)
        homepage = store.the(subj1, foaf('homepage'), None)
        if len(openid) == 0 and not homepage['obj'] is None:
            "check homepage for an openid if missing from foaf"
            try:
                tp = TagParser(urllib2.urlopen(unfrag(homepage['obj'])).read(), _filter_link_openid_delgate)
                if not tp['link'] is None and len(tp['link']):
                    openid = [unfrag(homepage['obj'])]
            except: pass

        if len(openid) > 0 or self._this_iter() <= 1:
            "follow foaf:knows for people with openid (or very close to top)"

            "save some metadata about the person"
            self._store.addStatement(name)
            self._store.addStatement(homepage)
            for obj in openid:
                "save openids to main store"
                self._store.add(subj1, foaf('openid'), obj, uri1)

            knows = [y['obj'] for y in store.statementsMatching(subj1, p=foaf('knows'))]
            #sys.stderr.write("%s has openid, following %s foaf:knows\n" % (Symbol(uri1).encode('ascii','ignore'), len(knows)))
            for who in knows:
                if not isBlank(who):
                    next2.append(who)
                knows_also = [z['obj'] for z in store.statementsMatching(s=who, p=schema('seeAlso'))]
                if (len(knows_also)):
                    next2.extend(knows_also)

        if len(next2) > 0:
            self._queue_walk(next2)
        self._lock_parse.release()

    def walk(self, num_iter=100):
        "walk foaf:knows from home"

        self._preload()
        for i in xrange(num_iter):
            sys.stderr.write("[ Iteration: %d | URIs: %d ]\n" % (i, len(self._this_walk())))
            self._run()
            if len(self._this_walk()) == 0:
                break

        openids = map(lambda x: unwrap(x['obj']), self._store.statementsMatching(p=foaf('openid')))
        while len(openids) > 0:
            y = openids.pop(0)
            if openids.count(y) == 0:
                "only print unique openids"
                print y


if __name__ == '__main__':
    import socket
    socket.setdefaulttimeout(8)
    if len(sys.argv) > 1:
        Walk().walk(int(sys.argv[1]))
    else:
        Walk().walk()
