# -*- coding: utf-8 -*-
"""
aptdata -- get airport data from wikipedia

Usage
-----

Try::

  python aptdata.py contacts.kid AUS

To use an httplib2_ cache in the "wikipedia-cache" directory use::

  python aptdata.py --cache contacts.kid AUS

.. _httplib2: http://bitworking.org/projects/httplib2/

Colophon
--------

This module is documented in rst_ format for use with epydoc_.

.. _epydoc: http://epydoc.sourceforge.net/
.. _rst: http://docutils.sourceforge.net/docs/user/rst/quickstart.html

The examples in the docstrings below are executable doctest_ unit tests.
Check them a la::

  $ python aptdata.py --test

.. _doctest: http://www.python.org/doc/lib/module-doctest.html

"""

__docformat__ = "restructuredtext en"

import logging
import re
import sys

try:
    import urllib2
except ImportError:  # Python 3 keeps the same opener API in urllib.request
    import urllib.request as urllib2

try:
    import BaseHTTPServer
except ImportError:  # Python 3 name of the same module
    import http.server as BaseHTTPServer

try:
    import kid  # http://lesscode.org/projects/kid/
except ImportError:
    # kid is only needed to render templates (serve/main); the scraping
    # helpers, --json and --test work without it.
    kid = None

# NOTE(review): the copy of this file under review had its embedded HTML
# stripped (tags inside test fixtures, regular expressions and the 404
# page).  Everything marked "reconstructed" below was rebuilt from the
# surviving fragments and the doctest expectations -- confirm against
# version control before relying on the exact markup.

Site = 'http://en.wikipedia.org'
UserAgent = 'aptdata.py/200604'


def airportCard(web, iata, card=None):
    """Get hCard info about an airport from Wikipedia

    :param web: function that takes a URI and returns info, bytes
    :param iata: IATA code of the relevant airport
    :param card: a JSON style dict, modified in place; a fresh dict is
                 created for each call when omitted
    :return: the card
    :raises IndexError: when the airport cannot be found (empty code,
                        or no matching entry in the index page)
    """
    # A literal {} default would be shared by every call, leaking
    # fields (geo, adr, url) from one airport into the next.
    if card is None:
        card = {}

    pg, name = airportArticle(web, iata)
    card['name'] = {'text': name}
    card['org'] = {'organization-name': name}
    card['fn'] = {'text': name}
    card['nickname'] = {'text': iata}

    # NOTE(review): the collapsed source does not show whether the
    # coordinate lookup was nested under this check; skipping the fetch
    # for an edit link yields the same card and saves a request.
    if pg.endswith("action=edit"):
        return card

    # hmm... '_' or 'text' ? escaping?
    card['url'] = {'_': pg}
    # hmm... use an IFP, e.g. foaf:homepage, too

    try:
        lat, lon, txt = latlong(web, pg)
    except ValueError:
        #_progress("can't find coords in ", pg)
        return card
    card['geo'] = {'latitude': lat, 'longitude': lon}

    try:
        cityPath, cityName = findServes(txt)
    except ValueError:
        return card

    # cityPath already starts with '/wiki/', so only the site is prefixed.
    cityURL = Site + cityPath
    adr = _cityAddress(cityName)

    # hmm... is the city served by an airport
    # "another person who will act on behalf of the
    # individual or resource associated with the vCard"?
    card['agent'] = {'org': {'organization-name': cityName},
                     'fn': {'text': cityName},
                     'url': {'_': cityURL},
                     'adr': dict(adr)}  # copy: the agent keeps its own adr

    if 'adr' in card:
        card['adr'].update(adr)
    else:
        card['adr'] = adr
    return card


def _cityAddress(cityName):
    """Split a "Locality, Region" name into hCard adr fields.

    Only the first comma separates locality from region, so names with
    further commas do not break the split.

    >>> sorted(_cityAddress('Austin, Texas').items())
    [('locality', 'Austin'), ('region', 'Texas')]
    >>> sorted(_cityAddress('Boston, Massachusetts, USA').items())
    [('locality', 'Boston'), ('region', 'Massachusetts, USA')]
    >>> _cityAddress('Paris')
    {'locality': 'Paris'}
    """
    if ',' in cityName:
        localityName, regionName = cityName.split(',', 1)
        return {'locality': localityName, 'region': regionName.strip()}
    return {'locality': cityName}


def airportArticle(web, iata):
    """Find the article for an airport via the by-IATA-code index page.

    :param web: function that takes a URI and returns info, bytes
    :param iata: IATA code of the airport
    :return: (absolute article URI, airport name)
    """
    listpg = idxpg(iata)
    _progress("getting list of airports containing ", iata, " in ", listpg)
    info, content = web(listpg)
    txt = content.decode('utf-8')
    path, name = findlink(txt, iata)
    return Site + path, name


def latlong(web, pg):
    """Fetch page `pg` and extract its coordinates.

    :return: (latitude, longitude, decoded page text)
    :raises ValueError: when no coordinates are found in the page
    """
    #_progress("finding data in", pg)
    info, content = web(pg)
    txt = content.decode('utf-8')
    lat, lon = findcoords(txt)
    return lat, lon, txt


def dms(o, d, m, s):
    """Convert degrees, minutes, seconds to signed decimal degrees.

    :param o: orientation; 'N' and 'E' are positive, anything else negative
    :param d: degrees (number)
    :param m: minutes (number)
    :param s: seconds, as a string or number

    >>> abs(dms(u'N', 30, 11, u'40.3') - 30.194527777777779) <.001
    True
    """
    sign = 1 if o in ('N', 'E') else -1
    return sign * (d + (m + float(s) / 60) / 60)


def idxpg(iata):
    """Return the URI of the index page listing airport code `iata`.

    Raises IndexError for an empty code.

    >>> idxpg("LGA")
    'http://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:_L'
    """
    return Site + '/wiki/List_of_airports_by_IATA_code:_' + iata[0]


# NOTE(review): reconstructed fixtures -- only the visible text survived.
_TestBOS = """
<li><b>BOS</b> (KBOS) \u2013
<a href="/wiki/Logan_International_Airport" title="Logan International Airport">Logan International Airport</a> \u2013
<a href="/wiki/Boston%2C_Massachusetts" title="Boston, Massachusetts">Boston, Massachusetts</a>, United States</li>
"""

_TestLGA = """
<li><b>LGA</b> (KLGA) \u2013
<a href="/wiki/LaGuardia_Airport" title="LaGuardia Airport">LaGuardia Airport</a> \u2013
<a href="/wiki/Flushing%2C_New_York" title="Flushing, New York">Flushing, New York</a> (New York City), United States</li>
"""

_TestAUS = """
<li><b>AUS</b> (KAUS) \u2013
<a href="/wiki/Austin-Bergstrom_International_Airport" title="Austin-Bergstrom International Airport">Austin-Bergstrom International Airport</a> \u2013
<a href="/wiki/Austin%2C_Texas" title="Austin, Texas">Austin, Texas</a>, United States</li>
"""


def findlink(txt, iata):
    """Find the article path and name for `iata` in an index page.

    :raises IndexError: when the code is not listed.
        NOTE(review): the exception type is inferred from the HTTP
        handler, which catches IndexError for "no such airport".

    >>> findlink(_TestBOS, "BOS")
    ('/wiki/Logan_International_Airport', 'Logan International Airport')
    >>> findlink(_TestLGA, "LGA")
    ('/wiki/LaGuardia_Airport', 'LaGuardia Airport')
    >>> findlink(_TestAUS, "AUS")
    ('/wiki/Austin-Bergstrom_International_Airport', 'Austin-Bergstrom International Airport')
    """
    # NOTE(review): pattern reconstructed (the <li> and <a href=...>
    # parts were stripped).  iata is escaped because it can come
    # straight from a request path.
    m = re.search(r'<li><\w+[^>]*>%s[^\)]+\)[^<]*<a href="([^"]+)"[^>]+>'
                  r'([^<]+)</a>' % re.escape(iata), txt)
    if not m:
        raise IndexError("no airport listed for %s" % iata)
    return m.group(1), m.group(2)


# NOTE(review): reconstructed fixtures; the mojibake degree signs are
# restored as \u00b0 and table markup is a guess.
_TestData2 = u"""
<tr><th>Coordinates</th><td>40\u00b0 46' 38.07" N<br />
73\u00b0 52' 21.39" W</td></tr>
"""  # ' help emacs

_TestData3 = u"""
<tr><th>Coordinates</th><td>39\u00b0 17' 51" N<br />
<br />
94\u00b0 42' 50" W</td></tr>
"""  # ' help emacs

_TestData4 = u"""
<tr><th>Coordinates</th><td>45\u00b040'47''N, 74\u00b002'19''W</td></tr>
"""

_TestData5 = u"""
<tr><th>Coordinates</th><td><a href="http://tools.wikimedia.de/~magnus/geo/geohack.php?params=51_28_39_N_0_27_41_W_type:airport">51\u00b028\u203239\u2033N, 0\u00b027\u203241\u2033W</a></td></tr>
"""


def _dmsPair(m):
    """Turn an 8-group (d, m, s, N/S, d, m, s, E/W) match into (lat, lon)."""
    return (dms(m.group(4), int(m.group(1)), int(m.group(2)), m.group(3)),
            dms(m.group(8), int(m.group(5)), int(m.group(6)), m.group(7)))


def findcoords(txt):
    """Find (latitude, longitude) in decimal degrees in an article.

    Tries, in order: a geohack link with d_m_s parameters, a geohack
    link with decimal parameters, and a "Coordinates" table row.

    :raises ValueError: when none of the forms is found

    >>> '%.6f, %.6f' % findcoords(_TestData2)
    '40.777242, -73.872608'

    >>> '%.6f, %.6f' % findcoords(_TestData3)
    '39.297500, -94.713889'

    >>> '%.6f, %.6f' % findcoords(_TestData4)
    '45.679722, -74.038611'

    >>> '%.6f, %.6f' % findcoords(_TestData5)
    '51.477500, -0.461389'

    >>> '%.6f, %.6f' % findcoords("http://tools.wikimedia.de/~magnus/geo/geohack.php?params=32_53_49_N_97_02_17_W_type:airport")
    '32.896944, -97.038056'

    >>> '%.6f, %.6f' % findcoords("http://tools.wikimedia.de/~magnus/geo/geohack.php?params=33_56_54.94_N_83_19_34.84_W_type:airport")
    '33.948594, -83.326344'

    >>> '%.6f, %.6f' % findcoords("http://tools.wikimedia.de/~magnus/geo/geohack.php?params=39.2975_N_94.7138888889_W_")
    '39.297500, -94.713889'

    Eastern longitudes are positive:

    >>> '%.4f, %.4f' % findcoords("geohack.php?params=48.3538_N_11.7861_E_")
    '48.3538, 11.7861'
    """
    m = re.search(r'\.php\?params='
                  r'([\d]+)_([\d]+)_([\d\.]+)_([NS])'
                  r'_([\d]+)_([\d]+)_([\d\.]+)_([EW])_type:airport', txt)
    if m:
        return _dmsPair(m)

    m = re.search(r'\.php\?params='
                  r'([\d\.]+)_([NS])_'
                  r'([\d\.]+)_([EW])_', txt)
    if m:
        lat = float(m.group(1)) * (1 if m.group(2) == 'N' else -1)
        # East is the positive direction for longitude (this used to
        # compare against 'N', which made every longitude negative).
        lon = float(m.group(3)) * (1 if m.group(4) == 'E' else -1)
        return lat, lon

    # NOTE(review): the tags that followed ">Coordinates" in this
    # pattern were stripped; any run of whitespace/tags is accepted.
    m = re.search(r'>Coordinates(?:\s|<[^>]*>)*'
                  r'(\d+)\D*(\d+)\D*([\d\.]+)..([NS])'
                  r'(?:<[^>]*>)*'
                  r'[, \n]*'
                  r'(?:<[^>]*>)*'
                  r'(\d+)\D*(\d+)\D*([\d\.]+)..([EW])', txt)
    if m:
        return _dmsPair(m)

    raise ValueError("coordinates not found")


# NOTE(review): reconstructed fixture and link pattern (findServes reads
# two groups, only one survived the tag stripping).
_TestServes = """
<tr><th>Serves</th>
<td><a href="/wiki/Baltimore%2C_Maryland" title="Baltimore, Maryland">Baltimore, Maryland</a></td></tr>
"""

_Spat = re.compile(r'>Serves<')
_SLpat = re.compile(r'<a href="([^"]+)"[^>]*>([^<]+)<')


def findServes(txt):
    """Find link ref and name of city served by this airport

    :raises ValueError: when there is no "Serves" entry or no link
                        after it

    >>> findServes(_TestServes)
    ('/wiki/Baltimore%2C_Maryland', 'Baltimore, Maryland')
    """
    m = _Spat.search(txt)
    if not m:
        raise ValueError
    # first link after the "Serves" label
    m2 = _SLpat.search(txt, m.end(0))
    if not m2:
        raise ValueError
    return m2.group(1), m2.group(2)


class WebCache(object):
    """Fetch pages through an httplib2 cache kept in a directory."""

    #hmm... 'max-age=3600' doesn't seem to help, given wikipedia's
    #cache-control: private, s-maxage=0, max-age=0, must-revalidate
    #only-if-cached works in a pinch
    # Class attribute so that get() can read it as self.hdrs.
    hdrs = {'User-agent': UserAgent,
            'cache-control': 'max-stale=3600'}

    def __init__(self, dirname):
        import httplib2
        self._h = httplib2.Http(dirname)

    def get(self, addr):
        """Request `addr`; return (response, content)."""
        resp, content = self._h.request(addr, headers=self.hdrs)
        return resp, content


_Airports = """AHO BOS CDG DCA DFW EWR FCO LHR MCI NCE ORD PIT SFO STL
YMX YVR BRS DTW AMS GLA YYZ MAN SNA MIA ZRH BUD CMH IAD NRT MSP MEM
BRU ATL HEL MKE EDI""".split()


class batteries_included_get(object):
    """urlopener suitable for use with wikipedia

    Umm... wikipedia seems to forbid use of default urllib UA string.
    cf http://en.wikipedia.org/wiki/User:Skagedal/Fafafa/Code
    """

    def __init__(self):
        self._opener = urllib2.build_opener()
        self._opener.addheaders = [('User-agent', UserAgent)]

    def __call__(self, addr):
        """Open `addr`; return (response object, body bytes)."""
        fp = self._opener.open(addr)
        content = fp.read()
        return fp, content


def _testonline():
    """Print the card of each airport in _Airports (needs network)."""
    for iata in _Airports:
        print(airportCard(batteries_included_get(), iata))


def _test():
    """Run the doctests in this module."""
    import doctest
    doctest.testmod()


def _progress(*args):
    """Write the arguments, space separated, as one line on stderr."""
    for a in args:
        sys.stderr.write('%s ' % a)
    sys.stderr.write("\n")


class WikiAirportServer(BaseHTTPServer.HTTPServer):
    """HTTP server that carries the page fetcher and the kid template."""

    def __init__(self, addr, handlerClass, web, template):
        BaseHTTPServer.HTTPServer.__init__(self, addr, handlerClass)
        self.web = web
        self.template = template


class WikiAirportHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Serve /apt/<IATA code> as a rendered airport card."""

    def do_GET(self):
        prefix = "/apt/"
        if not self.path.startswith(prefix):
            self.notFound()
            return

        iata = self.path[len(prefix):]
        try:
            txts = self.airportPage(iata)
        except IndexError:
            self.notFound()
            return

        self.send_response(200)
        # hmm... content type from cmd line, along with template?
        self.send_header("Content-type", "application/rdf+xml")
        self.end_headers()
        for txt in txts:
            self.wfile.write(txt)

    def airportPage(self, iata):
        """raises IndexError in the case of no such airport"""
        s = self.server
        template = s.template
        template.contacts = [airportCard(s.web, iata)]
        template.path = ''
        return template.generate(output='xml', encoding='utf-8')

    def notFound(self):
        """Report an HTTP 404 'Not Found' error and give a link
        to something to try.
        """
        self.send_response(404)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        # NOTE(review): markup reconstructed; only the text survived.
        self.wfile.write(b"""
<html><head><title>DBViewHandler: 404: Not Found</title></head>
<body>
<h1>Not Found (404)</h1>

<p>Try something like <a href="/apt/LGA">LGA</a>
or pick some other airport code.</p>
</body></html>
""")


def _kidTemplate(kidfn):
    """Load the kid template `kidfn`; ImportError if kid is missing."""
    if kid is None:
        raise ImportError("kid is required to render templates; "
                          "see http://lesscode.org/projects/kid/")
    return kid.Template(file=kidfn)


def serve():
    """Run the airport data server on port 8123.

    The kid template file name is read from sys.argv[1].
    """
    port = 8123
    access = WebCache("wikipedia-cache").get
    kidfn = sys.argv[1]  # raises IndexError
    template = _kidTemplate(kidfn)
    s = WikiAirportServer(('', port), WikiAirportHandler, access, template)
    print('%s %s' % ('Wiki airport data server running on port: ', port))
    s.serve_forever()


def main(argv):
    """Render the cards of the airports named on the command line.

    :param argv: [program, template file, IATA code...]; an optional
                 "--cache" flag is removed from the list in place
    :raises IndexError: when the template file argument is missing
    """
    if '--cache' in argv:
        #@@ kludge.. hardcoded filename
        access = WebCache("wikipedia-cache").get
        argv.remove("--cache")
    else:
        access = batteries_included_get()

    kidfn = argv[1]  # raises IndexError
    template = _kidTemplate(kidfn)
    template.contacts = [airportCard(access, code) for code in argv[2:]]
    template.path = ''
    for txt in template.generate(output='xml', encoding='utf-8'):
        sys.stdout.write(txt)


#<a href="..." title="...">Airports in Afghanistan</a>
# NOTE(review): reconstructed; _testcountries reads two groups but only
# the second survived the tag stripping.
AptsIn_pat = r'<a href="([^"]+)" title="[^"]*">Airports in ([^<]+)'


def _testcountries():
    """Print each "Airports in ..." link found on standard input."""
    txt = sys.stdin.read()
    for m in re.finditer(AptsIn_pat, txt):
        path, name = m.group(1), m.group(2)
        print('%s %s %s' % ("@@found: ", path, name))


if __name__ == '__main__':
    if '--test' in sys.argv:
        _test()
    elif '--test2' in sys.argv:
        _testonline()
    elif '--testC' in sys.argv:
        _testcountries()
    elif '--serve' in sys.argv:
        serve()
    elif '--json' in sys.argv:
        import pprint
        for code in sys.argv[2:]:
            # pprint already prints; wrapping it in print added "None"
            pprint.pprint(airportCard(batteries_included_get(), code))
    else:
        try:
            main(sys.argv)
        # The tuple is required: "except IndexError, KeyError" would
        # catch only IndexError and bind it to the name KeyError.
        except (IndexError, KeyError):
            sys.stderr.write('%s\n' % __doc__)
            sys.exit(2)