# -*- coding: utf-8 -*-
"""
aptdata -- get airport data from wikipedia
Usage
-----
Try::
python aptdata.py contacts.kid AUS
To use an httplib2_ cache in the "wikipedia-cache" directory use::
python aptdata.py --cache contacts.kid AUS
.. _httplib2: http://bitworking.org/projects/httplib2/
Colophon
--------
This module is documented in rst_ format for use with epydoc_.
.. _epydoc: http://epydoc.sourceforge.net/
.. _rst: http://docutils.sourceforge.net/docs/user/rst/quickstart.html
The examples in the docstrings below are executable doctest_ unit
tests. Check them a la::
$ python aptdata.py --test
.. _doctest: http://www.python.org/doc/lib/module-doctest.html
"""
# Declares the docstring markup of this module (reStructuredText, English)
# for documentation tools such as epydoc, as mentioned in the module docstring.
__docformat__ = "restructuredtext en"
import logging
import re
import urllib2
import kid # http://lesscode.org/projects/kid/
def airportCard(web, iata, card=None):
    """Get hCard info about an airport from Wikipedia

    :param web: function that takes a URI and returns info, bytes
    :param iata: IATA code of the relevant airport
    :param card: a JSON style dict, modified in place; a fresh dict is
                 created when it is omitted
    :return: the card dict, holding 'name', 'org', 'fn' and 'nickname',
             plus 'url', 'geo', 'agent' and 'adr' when they can be found
    :raise IndexError: propagated from `airportArticle` when the code is
                       not found on the index page
    """
    if card is None:
        # A literal {} default would be created once and then shared (and
        # overwritten) by every call that omits the argument.
        card = {}

    pg, name = airportArticle(web, iata)
    card['name'] = {'text': name}
    card['org'] = {'organization-name': name}
    card['fn'] = {'text': name}
    card['nickname'] = {'text': iata}

    if pg.endswith("action=edit"):
        # presumably a link to an article that does not exist yet, so
        # there is nothing to fetch -- TODO confirm
        return card

    # hmm... '_' or 'text' ? escaping?
    card['url'] = {'_': pg}  # hmm... use an IFP, e.g. foaf:homepage, too

    try:
        lat, lon, txt = latlong(web, pg)
    except ValueError:
        # no coordinates in the article; the page text is not available
        # either, so the city lookup below is skipped as well
        return card
    card['geo'] = {'latitude': lat, 'longitude': lon}

    try:
        cityPath, cityName = findServes(txt)
    except ValueError:
        return card

    # cityPath already starts with '/wiki/' (see the findServes doctest),
    # so only the site root is prepended.
    cityURL = Site + cityPath
    if ',' in cityName:
        # split once: names such as "City, Region, Country" must not
        # break the two-way unpacking
        localityName, regionName = cityName.split(',', 1)
        adr = {'locality': localityName, 'region': regionName.strip()}
    else:
        adr = {'locality': cityName}

    # hmm... is the city served by an airport
    # "another person who will act on behalf of the
    # individual or resource associated with the vCard"?
    card['agent'] = {'org': {'organization-name': cityName},
                     'fn': {'text': cityName},
                     'url': {'_': cityURL},
                     'adr': dict(adr),  # own copy, independent of card['adr']
                     }
    if 'adr' in card:
        card['adr'].update(adr)
    else:
        card['adr'] = adr
    return card
def airportArticle(web, iata):
    """Look up the Wikipedia article for an airport via the IATA index page.

    :param web: function that takes a URI and returns info, bytes
    :param iata: IATA code to look for
    :return: (absolute article URI, airport name)
    :raise IndexError: from `findlink` when the code is not on the page
    """
    index_uri = idxpg(iata)
    _progress("getting list of airports containing ", iata, " in ", index_uri)
    _info, raw = web(index_uri)
    # the index page is decoded as UTF-8 before it is searched
    href, title = findlink(raw.decode('utf-8'), iata)
    return Site + href, title
def latlong(web, pg):
    """Fetch a page and pull the airport coordinates out of it.

    :param web: function that takes a URI and returns info, bytes
    :param pg: URI of the page to fetch
    :return: (latitude, longitude, page text decoded as UTF-8)
    :raise ValueError: from `findcoords` when no coordinates are found
    """
    #_progress("finding data in", pg)
    _info, raw = web(pg)
    txt = raw.decode('utf-8')
    lat, lon = findcoords(txt)
    return lat, lon, txt
def dms(o, d, m, s):
    """Convert degrees, minutes, seconds and a hemisphere letter to
    signed decimal degrees.

    'N' and 'E' give a positive result; any other letter gives a
    negative one.  `s` may be a number or a numeric string.

    >>> abs(dms(u'N', 30, 11, u'40.3') - 30.194527777777779) <.001
    True
    """
    if o in ('N', 'E'):
        sign = 1
    else:
        sign = -1
    minutes = m + float(s)/60
    return sign * (d + minutes/60)
def idxpg(iata):
    """Return the URI of the index page listing airports whose IATA
    code starts with the same letter as `iata`.

    An empty `iata` raises IndexError.

    >>> idxpg("LGA")
    'http://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:_L'
    """
    index_base = Site + '/wiki/List_of_airports_by_IATA_code:_'
    return index_base + iata[0]
# Sample index-page entries used by the findlink doctests.
# NOTE(review): those doctests expect '/wiki/...' link paths that do not
# occur in these strings as shown here -- the link markup appears to be
# missing; confirm against the upstream copy of this file.
_TestBOS="""
BOS (KBOS) \u2013 Logan International Airport \u2013 Boston, Massachusetts, United States
"""
_TestLGA="""
LGA (KLGA) Ð LaGuardia Airport Ð Flushing, New York (New York City), United States
"""
_TestAUS = """
AUS (KAUS) Ð Austin-Bergstrom International Airport Ð Austin, Texas, United States
"""
def findlink(txt, iata):
    """Find the article link for an IATA code in the text of an index page.

    :param txt: text of the index page
    :param iata: IATA code; it is matched literally
    :return: (link path, airport name)
    :raise IndexError: if no entry for the code is found in `txt`

    >>> findlink(_TestBOS, "BOS")
    ('/wiki/Logan_International_Airport', 'Logan International Airport')
    >>> findlink(_TestLGA, "LGA")
    ('/wiki/LaGuardia_Airport', 'LaGuardia Airport')
    >>> findlink(_TestAUS, "AUS")
    ('/wiki/Austin-Bergstrom_International_Airport', 'Austin-Bergstrom International Airport')
    """
    # The code may come straight from a request path; escape it so that
    # regex metacharacters cannot break or alter the pattern.  An
    # unknown code then always ends in IndexError, which callers handle.
    # NOTE(review): the pattern as it stands defines one capture group
    # while two are read below -- part of it looks lost; confirm against
    # the upstream copy of this file.
    m = re.search(r'<\w+[^>]*>%s\w+>[^\)]+\)[^<]*]+>'
                  '([^<]+)'
                  % re.escape(iata), txt)
    if not m:
        raise IndexError("%s not found" % iata)
    return m.group(1), m.group(2)
# Sample "Coordinates" snippets used by the findcoords doctests, covering
# several degree/minute/second spellings.
# NOTE(review): any surrounding markup these samples once carried is not
# present as shown here; confirm against the upstream copy of this file.
_TestData2 = u"""
Coordinates |
40¡ 46' 38.07" N
73¡ 52' 21.39" W |
"""
# ' help emacs
_TestData3 = u"""
Coordinates |
39¡ 17' 51" N
94¡ 42' 50" W
|
"""
# ' help emacs
_TestData4 = u"""
Coordinates |
45¡4047N, 74¡0219W |
"""
_TestData5 = u"""
Coordinates |
51°28\u203239\u2033N, 0°27\u203241\u2033W |
"""
def findcoords(txt):
    """Extract (latitude, longitude) in signed decimal degrees from page text.

    Three notations are tried in turn: a geohack link with
    degrees_minutes_seconds parameters, a geohack link with decimal
    degrees, and a "Coordinates" entry in the page itself.  North and
    East are positive, South and West negative.

    :param txt: page text (or a geohack URI)
    :return: (latitude, longitude) as floats
    :raise ValueError: if none of the notations is found

    >>> findcoords(_TestData2)
    (40.777241666666669, -73.872608333333332)
    >>> findcoords(_TestData3)
    (39.297499999999999, -94.713888888888889)
    >>> findcoords(_TestData4)
    (45.679722222222225, -74.038611111111109)
    >>> findcoords(_TestData5)
    (51.477499999999999, -0.4613888888888889)
    >>> findcoords("http://tools.wikimedia.de/~magnus/geo/geohack.php?params=32_53_49_N_97_02_17_W_type:airport")
    (32.896944444444443, -97.038055555555559)
    >>> findcoords("http://tools.wikimedia.de/~magnus/geo/geohack.php?params=33_56_54.94_N_83_19_34.84_W_type:airport")
    (33.948594444444446, -83.326344444444445)
    >>> findcoords("http://tools.wikimedia.de/~magnus/geo/geohack.php?params=39.2975_N_94.7138888889_W_")
    (39.297499999999999, -94.713888888900001)
    >>> findcoords("http://tools.wikimedia.de/~magnus/geo/geohack.php?params=48.5_N_2.25_E_")
    (48.5, 2.25)
    """
    # 1. geohack link: deg_min_sec_[NS]_deg_min_sec_[EW]_type:airport
    m = re.search(r'\.php\?params='
                  r'([\d]+)_([\d]+)_([\d\.]+)_([NS])'
                  r'_([\d]+)_([\d]+)_([\d\.]+)_([EW])_type:airport',
                  txt)
    if m:
        return dms(m.group(4), int(m.group(1)), int(m.group(2)), m.group(3)),\
               dms(m.group(8), int(m.group(5)), int(m.group(6)), m.group(7))

    # 2. geohack link: decimal degrees, deg_[NS]_deg_[EW]_
    m = re.search(r'\.php\?params='
                  r'([\d\.]+)_([NS])_'
                  r'([\d\.]+)_([EW])_',
                  txt)
    if m:
        lat = float(m.group(1))
        if m.group(2) != 'N':
            lat = -lat
        lon = float(m.group(3))
        # the positive hemisphere for longitude is 'E'; testing against
        # 'N' here would make every longitude negative
        if m.group(4) != 'E':
            lon = -lon
        return lat, lon

    # 3. "Coordinates" entry in the page text
    # NOTE(review): the first three fragments of this pattern look as if
    # markup was lost from them; they are kept exactly as found --
    # confirm against the upstream copy of this file.
    m = re.search(r'>Coordinates\s*]*>'
                  r'(?:]+>)?'
                  r'(?:]*>)?'
                  r'(\d+)\D*(\d+)\D*([\d\.]+)..([NS])'
                  r'(?:<[^>]*>)*'
                  r'[, \n]*'
                  r'(?:<[^>]*>)*'
                  r'(\d+)\D*(\d+)\D*([\d\.]+)..([EW])',
                  txt)
    if m:
        return dms(m.group(4), int(m.group(1)), int(m.group(2)), m.group(3)),\
               dms(m.group(8), int(m.group(5)), int(m.group(6)), m.group(7))

    raise ValueError("coordinates not found")
# Sample "Serves" snippet used by the findServes doctest.
# NOTE(review): that doctest expects a '/wiki/...' path which does not
# occur in this string as shown here; confirm against the upstream copy.
_TestServes="""
Serves |
Baltimore, Maryland |
"""
# Locates the "Serves" label; findServes starts its link search after it.
_Spat = re.compile(r'>Serves<')
# Applied by findServes from the end of the _Spat match onwards.
# NOTE(review): findServes reads groups 1 and 2, but this pattern defines
# only one group as shown here -- part of it looks lost; confirm upstream.
_SLpat = re.compile(r']*>([^<]+)<')
def findServes(txt):
    """Find link ref and name of city served by this airport

    The "Serves" label is located first; the city link is then searched
    for from the end of that label.

    :raise ValueError: if either the label or the link is not found

    >>> findServes(_TestServes)
    ('/wiki/Baltimore%2C_Maryland', 'Baltimore, Maryland')
    """
    label = _Spat.search(txt)
    if label is None:
        raise ValueError
    link = _SLpat.search(txt, label.end(0))
    if link is None:
        raise ValueError
    return link.group(1), link.group(2)
# Root URI of the wiki; page paths such as '/wiki/...' are appended to it.
Site = 'http://en.wikipedia.org'
# Value sent in the User-agent header by WebCache and batteries_included_get.
UserAgent = 'aptdata.py/200604'
class WebCache(object):
    """Page fetcher that goes through an httplib2 cache directory.

    The bound `get` method has the "takes a URI, returns info, bytes"
    shape expected for the `web` argument used elsewhere in this module.
    """

    #hmm... 'max-age=3600' doesn't seem to help, given wikipedia's
    #cache-control: private, s-maxage=0, max-age=0, must-revalidate
    #only-if-cached works in a pinch
    # Request headers sent with every fetch.
    hdrs = {'User-agent': UserAgent,
            'cache-control': 'max-stale=3600'
            }

    def __init__(self, dirname):
        """Create the underlying httplib2 client.

        :param dirname: passed to httplib2.Http as its cache argument
        """
        # imported here so that httplib2 is only needed when a cache is used
        import httplib2
        self._h = httplib2.Http(dirname)

    def get(self, addr):
        """Request `addr` and return (response, content)."""
        response, body = self._h.request(addr, headers=self.hdrs)
        return response, body
# IATA codes looked up one after another by _testonline().
_Airports = """AHO BOS CDG DCA DFW EWR FCO LHR MCI NCE ORD PIT SFO STL
YMX YVR BRS DTW AMS GLA YYZ MAN SNA MIA ZRH BUD CMH IAD NRT MSP MEM
BRU ATL HEL MKE EDI""".split()
class batteries_included_get(object):
    """urlopener suitable for use with wikipedia

    Umm... wikipedia seems to forbid use of default urllib UA string.
    cf http://en.wikipedia.org/wiki/User:Skagedal/Fafafa/Code

    Instances are callable: they take a URI and return the open
    response object together with the bytes read from it.
    """

    def __init__(self):
        """Build a urllib2 opener whose only header is our User-agent."""
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', UserAgent)]
        self._opener = opener

    def __call__(self, addr):
        """Open `addr` and return (response object, content)."""
        response = self._opener.open(addr)
        return response, response.read()
def _testonline():
    """Fetch and print the card of every airport listed in _Airports.

    This performs live requests; a new fetcher is created for each code.
    """
    for code in _Airports:
        print(airportCard(batteries_included_get(), code))
def _test():
    """Run doctest over the module executing as __main__ (this file,
    when it is started with --test)."""
    from doctest import testmod
    testmod()
def _progress(*args):
import sys
for a in args:
sys.stderr.write('%s ' % a)
sys.stderr.write("\n")
import BaseHTTPServer
class WikiAirportServer(BaseHTTPServer.HTTPServer):
    """HTTP server that carries the page fetcher and the output template
    for its request handlers, which reach them via `self.server`."""

    def __init__(self, addr, handlerClass, web, template):
        """
        :param addr: passed on to HTTPServer
        :param handlerClass: request handler class, passed on to HTTPServer
        :param web: function that takes a URI and returns info, bytes
        :param template: template object used to render airport pages
        """
        BaseHTTPServer.HTTPServer.__init__(self, addr, handlerClass)
        self.template = template
        self.web = web
class WikiAirportHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Request handler answering GET /apt/<code> with the rendered
    airport card; every other path gets a 404 page."""

    def do_GET(self):
        """Serve /apt/<code>, or report 404 for other paths and for
        codes whose lookup raises IndexError."""
        prefix = "/apt/"
        if not self.path.startswith(prefix):
            self.notFound()
            return

        code = self.path[len(prefix):]
        try:
            chunks = self.airportPage(code)
        except IndexError:
            self.notFound()
            return

        self.send_response(200)
        # hmm... content type from cmd line, along with template?
        self.send_header("Content-type", "application/rdf+xml")
        self.end_headers()
        for chunk in chunks:
            self.wfile.write(chunk)

    def airportPage(self, iata):
        """Render the server's template for one airport.

        Raises IndexError in the case of no such airport.
        Returns whatever the template's generate() call yields.
        """
        server = self.server
        page = server.template
        page.contacts = [airportCard(server.web, iata)]
        page.path = ''
        return page.generate(output='xml', encoding='utf-8')

    def notFound(self):
        """Send a 404 response with a short hint page suggesting an
        airport code to try."""
        self.send_response(404)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write("""
DBViewHandler: 404: Not Found
Not Found (404)
Try something like LGA
or pick some other airport code.
""")
def serve():
    """Run the airport data HTTP server on port 8123 until interrupted.

    Pages are fetched through a WebCache in the "wikipedia-cache"
    directory and rendered with the kid template named by sys.argv[1].
    """
    import sys
    # not referenced below (WebCache imports it itself)
    import httplib2
    listen_port = 8123
    fetch = WebCache("wikipedia-cache").get
    template_file = sys.argv[1]  # IndexError if no argument was given
    page_template = kid.Template(file=template_file)
    httpd = WikiAirportServer(('', listen_port),
                              WikiAirportHandler, fetch, page_template)
    print('%s %s' % ('Wiki airport data server running on port: ', listen_port))
    httpd.serve_forever()
def main(argv):
    """Render cards for the airports named on the command line and
    write the result to stdout.

    :param argv: argument list shaped like sys.argv: program name, an
                 optional '--cache' flag, the kid template file name,
                 then any number of IATA codes.  The list is not modified.
    :raise IndexError: if no template file name is given
    """
    # sys is otherwise only imported under the __main__ guard, so import
    # it here to keep main() usable when this module is imported.
    import sys

    # work on a copy so the caller's list keeps its '--cache' entry
    args = list(argv)
    if '--cache' in args:
        #@@ kludge.. hardcoded filename
        access = WebCache("wikipedia-cache").get
        args.remove('--cache')
    else:
        access = batteries_included_get()

    kidfn = args[1]  # raises IndexError when missing
    template = kid.Template(file=kidfn)
    # pass a fresh dict per airport so the cards never share state
    template.contacts = [airportCard(access, code, {}) for code in args[2:]]
    template.path = ''
    for txt in template.generate(output='xml', encoding='utf-8'):
        sys.stdout.write(txt)
# Pattern compiled by _testcountries() to scan its input for
# "Airports in <name>" entries.  Sample input:
#Airports in Afghanistan
# NOTE(review): _testcountries reads groups 1 and 2, but only one group
# is defined here as shown -- part of the pattern (and of the sample
# above) looks lost; confirm against the upstream copy of this file.
AptsIn_pat = r'Airports in ([^<]+)'
def _testcountries():
    """Read a page from stdin and print every AptsIn_pat match in it.

    The text is scanned left to right, each search resuming where the
    previous match ended.
    """
    import sys
    page = sys.stdin.read()
    pat = re.compile(AptsIn_pat)
    pos = 0
    end = len(page)
    while pos < end:
        hit = pat.search(page, pos)
        if not hit:
            break
        pos = hit.end(0)
        path, name = hit.group(1), hit.group(2)
        print('%s %s %s' % ("@@found: ", path, name))
# Command line entry point.  The first matching flag picks the mode:
#   --test   run the doctests
#   --test2  fetch and print cards for the built-in airport list
#   --testC  scan stdin for "Airports in ..." entries
#   --serve  run the HTTP server
#   --json   pretty-print the card of each code in sys.argv[2:]
# With none of these, main() renders the template; the usage text is
# shown when its arguments are missing.
if __name__ == '__main__':
    import sys
    if '--test' in sys.argv:
        _test()
    elif '--test2' in sys.argv:
        _testonline()
    elif '--testC' in sys.argv:
        _testcountries()
    elif '--serve' in sys.argv:
        serve()
    elif '--json' in sys.argv:
        import pprint
        for code in sys.argv[2:]:
            # pprint.pprint() does the printing itself and returns None,
            # so its result must not be printed again
            pprint.pprint(airportCard(batteries_included_get(), code))
    else:
        try:
            main(sys.argv)
        except (IndexError, KeyError):
            # a tuple is required to catch both; "except A, B" would
            # catch only A and bind the exception to the name B
            sys.stderr.write("%s\n" % __doc__)
            sys.exit(2)