Creating a faster crawler

João Júnior

PyMalta - December 5, 2018

Sequential

Parallelism

Concurrency

API

import time
from flask import Flask

app = Flask(__name__)
TIME_FASTER, TIME_SLOWLY = 1, 10

@app.route("/faster")
def faster():
    time.sleep(TIME_FASTER)
    return "Faster!"

@app.route("/slowly")
def slowly():
    time.sleep(TIME_SLOWLY)
    return "Slowly!"

@app.route("/text")
def text():
    return "Text!"
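
Every crawler below imports URL_FASTER and URL_SLOWLY from a constants module that the slides do not show. A minimal sketch, assuming the Flask app above runs locally on Flask's default port 5000:

# constants.py -- hypothetical module assumed by the crawler examples;
# adjust BASE_URL to wherever the Flask app is actually served.
BASE_URL = "http://localhost:5000"
URL_FASTER = BASE_URL + "/faster"   # responds after ~1 second
URL_SLOWLY = BASE_URL + "/slowly"   # responds after ~10 seconds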

Sequential Crawler

import requests
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

if __name__ == '__main__':
    # One slow request (~10 s) followed by 20 fast ones (~1 s each),
    # issued one after another, so the waits add up.
    crawler(URL_SLOWLY)
    for i in range(20):
        crawler(URL_FASTER)
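
To compare the variants, the relevant number is wall-clock time for the whole batch of 21 requests. A minimal sketch of how a run might be timed (the timed helper and sequential_crawl wrapper are hypothetical, not from the slides):

# Hypothetical timing harness; assumes the Flask API and constants.py above.
import time

import requests
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

def timed(label, fn):
    # Run fn() once and print its wall-clock duration.
    start = time.perf_counter()
    fn()
    print(f"{label}: {time.perf_counter() - start:.2f} s")

def sequential_crawl():
    crawler(URL_SLOWLY)
    for i in range(20):
        crawler(URL_FASTER)

if __name__ == '__main__':
    # Against the API above this should take roughly 10 s + 20 x 1 s = ~30 s.
    timed("sequential", sequential_crawl)

That sequential figure is the natural baseline for the concurrent variants that follow.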
          

Concurrent Crawler - Threads

from threading import Thread
import requests
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

# One thread per request; the GIL is released while each thread waits
# on the network, so the 21 downloads overlap.
threads = [Thread(target=crawler, args=(URL_SLOWLY,))]
for i in range(20):
    threads.append(Thread(target=crawler, args=(URL_FASTER,)))
for t in threads:
    t.start()
for t in threads:
    t.join()
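
The same fan-out can be written with the standard library's concurrent.futures, which avoids managing Thread objects by hand and collects return values for free. A minimal sketch under the same assumptions (crawler and constants as above):

# Hypothetical alternative to the hand-rolled Thread list, using a thread pool.
from concurrent.futures import ThreadPoolExecutor

import requests
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

urls = [URL_SLOWLY] + [URL_FASTER] * 20
with ThreadPoolExecutor(max_workers=len(urls)) as executor:
    # map() runs crawler on each URL across the pool and yields the
    # status codes in the order the URLs were submitted.
    status_codes = list(executor.map(crawler, urls))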
          

Concurrent Crawler - Green Threads

import gevent
import gevent.monkey
# Patch the socket module before requests is imported so that its
# blocking socket calls become cooperative and yield to the gevent hub.
gevent.monkey.patch_socket()

import requests
from gevent import Greenlet
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

gthreads = [Greenlet(crawler, URL_SLOWLY)]
for i in range(20):
    gthreads.append(Greenlet(crawler, URL_FASTER))
for gthread in gthreads:
    gthread.start()
gevent.joinall(gthreads)
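
If you prefer not to manage Greenlet objects directly, gevent.spawn() creates and starts a greenlet in one call, and gevent.pool.Pool caps how many run at once. A minimal sketch (the pool size of 10 is an arbitrary assumption):

# Hypothetical variant of the Greenlet example using spawn() and a Pool.
from gevent import monkey
monkey.patch_socket()  # patch before importing requests

import gevent
from gevent.pool import Pool
import requests
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

pool = Pool(10)  # at most 10 requests in flight at a time
greenlets = [pool.spawn(crawler, URL_SLOWLY)]
for i in range(20):
    greenlets.append(pool.spawn(crawler, URL_FASTER))
gevent.joinall(greenlets)
status_codes = [g.value for g in greenlets]  # value holds each return value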
          

Concurrent Crawler - AsyncIO

import asyncio
import aiohttp
from constants import URL_FASTER, URL_SLOWLY

async def crawler(url):
    async with aiohttp.ClientSession() as session:
        # Use the response as a context manager so the connection is released.
        async with session.get(url) as response:
            return response.status

loop = asyncio.get_event_loop()
futures = [asyncio.ensure_future(crawler(URL_SLOWLY))]
for i in range(20):
    futures.append(asyncio.ensure_future(crawler(URL_FASTER)))
# All 21 coroutines run on one event loop; awaiting the slow response
# simply lets the fast ones proceed in the meantime.
loop.run_until_complete(asyncio.gather(*futures))
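
Opening a new ClientSession per request works, but it throws away aiohttp's connection pooling. A variant that shares one session across all 21 requests is sketched below (asyncio.run assumes Python 3.7+; not from the slides):

# Hypothetical variant sharing a single ClientSession across requests.
import asyncio
import aiohttp
from constants import URL_FASTER, URL_SLOWLY

async def crawler(session, url):
    async with session.get(url) as response:
        return response.status

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [crawler(session, URL_SLOWLY)]
        tasks += [crawler(session, URL_FASTER) for i in range(20)]
        return await asyncio.gather(*tasks)

status_codes = asyncio.run(main())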
          

"Parallel" Crawler - Multiprocessing

from multiprocessing import Pool
import requests
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

if __name__ == '__main__':
    # The guard is required on platforms that spawn worker processes.
    urls = [URL_SLOWLY]
    for i in range(20):
        urls.append(URL_FASTER)
    # One worker process per URL; map() blocks until every result is back.
    with Pool(21) as p:
        status_codes = p.map(crawler, urls)
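
For an I/O-bound crawl, starting one OS process per URL is heavyweight. The standard library's multiprocessing.dummy exposes the same Pool API backed by threads, which makes the two approaches easy to compare directly. A minimal sketch (not from the slides):

# Hypothetical thread-backed counterpart of the process Pool above.
from multiprocessing.dummy import Pool as ThreadBackedPool

import requests
from constants import URL_FASTER, URL_SLOWLY

def crawler(url):
    response = requests.get(url)
    return response.status_code

if __name__ == '__main__':
    urls = [URL_SLOWLY] + [URL_FASTER] * 20
    with ThreadBackedPool(21) as pool:
        status_codes = pool.map(crawler, urls)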
          
Benchmark #1 - Graphs #1-#5 (results charts)
Benchmark #2 - Graphs #1-#5 (results charts)
Benchmark #3 - Graphs #1-#5 (results charts)