Creating a faster crawler
João Júnior
PyMalta - December 5, 2018
Sequential
Parallelism
Concurrent
API
import time

from flask import Flask

app = Flask(__name__)

TIME_FASTER, TIME_SLOWLY = 1, 10


@app.route("/faster")
def faster():
    time.sleep(TIME_FASTER)  # simulates a quick endpoint (1 s)
    return "Faster!"


@app.route("/slowly")
def slowly():
    time.sleep(TIME_SLOWLY)  # simulates a slow endpoint (10 s)
    return "Slowly!"


@app.route("/text")
def text():
    return "Text!"
Sequential Crawler
import requests

from constants import URL_FASTER, URL_SLOWLY


def crawler(url):
    response = requests.get(url)
    return response.status_code


if __name__ == '__main__':
    # One slow request followed by 20 fast ones, all blocking in sequence
    crawler(URL_SLOWLY)
    for i in range(20):
        crawler(URL_FASTER)
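To compare the variants, a simple wall-clock timer helps; this helper is not in the original slides and the call in the comment assumes the crawler and URL names defined above:

# Hypothetical timing helper for comparing the approaches
import time

def timed(label, fn):
    start = time.perf_counter()
    fn()
    print(f"{label}: {time.perf_counter() - start:.2f}s")

# Example: 1 slow + 20 fast requests, run sequentially
# timed("sequential", lambda: [crawler(u) for u in [URL_SLOWLY] + [URL_FASTER] * 20])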
Concurrent Crawler - Threads
from threading import Thread

import requests

from constants import URL_FASTER, URL_SLOWLY


def crawler(url):
    response = requests.get(url)
    return response.status_code


# One thread for the slow URL plus 20 for the fast one
threads = [Thread(target=crawler, args=(URL_SLOWLY,))]
for i in range(20):
    threads.append(Thread(target=crawler, args=(URL_FASTER,)))

for t in threads:
    t.start()
for t in threads:
    t.join()
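One thing this version discards is crawler's return value, since Thread(target=...) does not hand results back. A sketch of an alternative, not shown in the slides, that keeps the same blocking crawler but collects the status codes with concurrent.futures:

# Sketch: same crawler, but a thread pool that returns results
from concurrent.futures import ThreadPoolExecutor

urls = [URL_SLOWLY] + [URL_FASTER] * 20
with ThreadPoolExecutor(max_workers=21) as executor:
    status_codes = list(executor.map(crawler, urls))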
Concurrent Crawler - GreenThreads
import gevent.monkey

# Patch the socket module before importing requests, as gevent recommends,
# so its blocking I/O cooperates with gevent's event loop
gevent.monkey.patch_socket()

import gevent
import requests
from gevent import Greenlet

from constants import URL_FASTER, URL_SLOWLY


def crawler(url):
    response = requests.get(url)
    return response.status_code


gthreads = [Greenlet(crawler, URL_SLOWLY)]
for i in range(20):
    gthreads.append(Greenlet(crawler, URL_FASTER))

for gthread in gthreads:
    gthread.start()
gevent.joinall(gthreads)
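Once gevent.joinall returns, each Greenlet exposes crawler's return value on its value attribute, so the status codes can still be collected. A small sketch, continuing from the code above:

# Collect the results after all greenlets have finished
status_codes = [gthread.value for gthread in gthreads]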
Concurrent Crawler - AsyncIO
import asyncio

import aiohttp

from constants import URL_FASTER, URL_SLOWLY


async def crawler(url):
    async with aiohttp.ClientSession() as session:
        response = await session.get(url)
        return response.status


loop = asyncio.get_event_loop()
futures = [asyncio.ensure_future(crawler(URL_SLOWLY))]
for i in range(20):
    futures.append(asyncio.ensure_future(crawler(URL_FASTER)))

loop.run_until_complete(asyncio.gather(*futures))
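asyncio.gather also returns the coroutines' results in order, so the status codes can be captured from run_until_complete. On Python 3.7+ the same run can be written as a standalone sketch with asyncio.run, assuming the crawler coroutine above:

# Sketch: capture the gathered status codes (Python 3.7+ style)
async def main():
    tasks = [crawler(URL_SLOWLY)] + [crawler(URL_FASTER) for _ in range(20)]
    return await asyncio.gather(*tasks)

status_codes = asyncio.run(main())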
Crawler "Parallel" - Multiprocessing
from multiprocessing import Pool
import requests
from constants import URL_FASTER, URL_SLOWLY
def crawler(url):
response = requests.get(url)
return response.status_code
p = Pool(21)
urls = [URL_SLOWLY]
for i in range(20):
urls.append(URL_FASTER)
p.map(crawler, urls)
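Pool.map likewise returns the status codes in order, and since Python 3.3 Pool works as a context manager so the workers are cleaned up automatically. A minimal sketch of the same run:

# Sketch: same run, with the pool closed automatically and results kept
if __name__ == '__main__':
    urls = [URL_SLOWLY] + [URL_FASTER] * 20
    with Pool(len(urls)) as pool:
        status_codes = pool.map(crawler, urls)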