スクリプトのコードは以下のとおりです:
#!/usr/bin/env python
"""
Script for job search on odesk.com

Dependencies: lxml, pycurl, grab, jinja2
"""
from grab import Grab
from grab.tools.work import make_work
import urllib
from datetime import datetime, timedelta
from jinja2 import Template
import webbrowser
import tempfile
from codecs import open
import re
import os
import sys
from xml.sax.saxutils import escape as xml_escape

# Search terms; one API request is issued per entry.
QUERY_LIST = ['parsing', 'parse', 'scrape', 'sraping', 'python', 'django',
              'scrapy', 'scraper', 'parser', 'scrapping', 'scrapper', 'ruby']

# Directory containing this script; ``result.html`` is written here in
# "save" mode.
ROOT = os.path.dirname(os.path.realpath(__file__))

# Jinja2 template of the report.  Plain-text fields go through the ``e``
# filter because they come from a remote service.  ``job.description`` is
# inserted as-is: it is already HTML (escaped text plus <b> highlights).
TEMPLATE = """
<html><head>
<style type="text/css">
    b {background-color: yellow; padding: 0.2em;}
    .meta {font-size: 0.8em; font-family: sans-serif;}
    .description {background-color: #e3e3e3; padding: 0.5em;}
</style>
<link rel="shortcut icon" href="https://www.odesk.com/favicon.ico" />
</head><body>
{% for job in jobs %}
    <h3>{{ job.title|e }}</h3>
    <div class="meta">
        {{ job.date|e }} / {{ job.type|e }} / {{ job.amount|e }}$ /
        <a target="_blank" href="{{ job.url|e }}">view</a>
    </div>
    <p class="description">{{ job.description.replace('\n', '<br/>') }}</p>
{% endfor %}
</body></html>
"""


def smart_str(val):
    """
    Normalize unicode and byte strings to byte strings.

    Unicode values are encoded as UTF-8; anything else is returned unchanged.
    """
    return val.encode('utf-8') if isinstance(val, unicode) else val


def highlight(data, word, escape=False):
    """
    Wrap all ``word`` entries in <b> tag.

    Matching is case-insensitive and ``word`` is treated as literal text,
    not as a regular expression.

    If ``escape`` is true, ``data`` is treated as plain text: every piece of
    it (matched or not) is HTML-escaped, so the result is safe to insert
    into a page.  With the default ``escape=False`` the text is passed
    through unchanged apart from the added <b> tags.
    """
    rex = re.compile(r'(%s)' % re.escape(word), re.I)
    if not escape:
        return rex.sub(r'<b>\1</b>', data)
    # Escaping must happen per piece: escaping the whole text first would
    # let a word such as "amp" match inside an "&amp;" entity.
    # split() with a single capturing group yields text, match, text, ...
    # so the odd positions are the matches.
    pieces = rex.split(data)
    return ''.join(
        '<b>%s</b>' % xml_escape(piece) if idx % 2 else xml_escape(piece)
        for idx, piece in enumerate(pieces))


def _field(elem, name):
    """Return the text of child ``name`` of ``elem``, or '-' if absent."""
    values = elem.xpath('./%s/text()' % name)
    return values[0] if values else '-'


def make_query(query):
    """
    Run one job search and return the found jobs as a list of dicts.

    The request carries ``query`` (URL-encoded) as ``q`` and yesterday's
    date as ``dp`` (presumably a "date posted" filter -- confirm against
    the API documentation).

    Each dict has the keys ``id``, ``title``, ``type``, ``date``,
    ``description``, ``amount`` and ``url``.  ``description`` is an HTML
    fragment with ``query`` highlighted; the other text fields are raw
    text.  Records without a numeric ``op_recno`` are skipped because
    ``main`` uses the id for de-duplication and ordering.
    """
    g = Grab()
    date_posted = (datetime.now() - timedelta(days=1)).strftime('%m-%d-%Y')
    g.go('http://www.odesk.com/api/profiles/v1/search/jobs.xml?q=%s&dp=%s' % (
        urllib.quote_plus(smart_str(query)), date_posted))

    jobs = []
    for elem in g.css_list('jobs job'):
        try:
            job_id = int(_field(elem, 'op_recno'))
        except ValueError:
            # Missing ('-') or malformed record number.
            continue
        jobs.append({
            'id': job_id,
            'title': _field(elem, 'op_title'),
            'type': _field(elem, 'job_type'),
            'date': _field(elem, 'date_posted'),
            'description': highlight(_field(elem, 'op_description'), query,
                                     escape=True),
            'amount': _field(elem, 'amount'),
            'url': 'https://www.odesk.com/jobs/%s' % _field(elem,
                                                            'ciphertext'),
        })
    return jobs


def main():
    """
    Collect jobs for every entry of QUERY_LIST and render them as HTML.

    Jobs returned by several queries are listed once (the first result
    seen wins) and sorted by id, highest first.  When the first
    command-line argument is ``save`` the page is written to
    ``result.html`` next to this script; otherwise it is written to a
    temporary file that is opened in the web browser.
    """
    jobs_by_id = {}
    # The third argument is presumably the worker count -- see grab docs.
    for res in make_work(make_query, QUERY_LIST, 20):
        for job in res:
            jobs_by_id.setdefault(job['id'], job)

    jobs = sorted(jobs_by_id.values(), key=lambda x: x['id'], reverse=True)
    print('Total: %d' % len(jobs))

    html = Template(TEMPLATE).render(jobs=jobs)

    if len(sys.argv) > 1 and sys.argv[1] == 'save':
        with open(os.path.join(ROOT, 'result.html'), 'w', 'utf-8') as out:
            out.write(html)
    else:
        fd, fname = tempfile.mkstemp(suffix='.html')
        # mkstemp returns an already-open descriptor; close it so it is not
        # leaked, then reopen the path through codecs.open for UTF-8 output.
        os.close(fd)
        with open(fname, 'w', 'utf-8') as out:
            out.write(html)
        webbrowser.open('file:///%s' % fname)


if __name__ == '__main__':
    main()
使用方法:
* 検索クエリ(QUERY_LIST)を自分用にカスタマイズする
* crontab に次のコマンドを登録する: python script.py save
このスクリプトは、以前の記事で紹介したスクレイピング用ライブラリ Grab を使用しています。