I use asyncio to speed up web scraping. I collect only the title, author, tags, datetime, and total comment count from the list view of a specific website, and I collect these from all of its pages. I would like to improve my code, so I would appreciate any ideas.
My code:
from bs4 import BeautifulSoup, Tag
from dataclasses import dataclass
from typing import List
import aiohttp
import asyncio
from functools import reduce
from operator import iconcat


@dataclass
class Article:
    title: str
    author: str
    tags: str
    upload_on: str
    comments: int
    link: str

    @classmethod
    def from_page_items(cls, item: Tag) -> 'Article':
        spans = item.find('div', {'class': 'entry__header'}).find_all('span')
        entry_title = item.find('h2', {'class': 'entry__title'})
        anchor = item.find('div', {'class': 'entry__header'}).find_all('a')
        return cls(
            title=entry_title.text.strip(),
            author=anchor[1].text,
            tags=anchor[2].text,
            upload_on=spans[0].text,
            comments=int(spans[1].text) if len(spans) > 1 else 0,
            link=entry_title.find('a').get('href')
        )

class Scrape:
    def __init__(self, url) -> None:
        self.session = None
        self.url = url

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, *args):
        await self.session.close()

    async def fetch_url(self, params: dict = None) -> BeautifulSoup:
        """Fetch the URL and return the parsed HTML document.

        Args:
            params (dict, optional): query string parameters. Defaults to None.

        Returns:
            BeautifulSoup: the parsed HTML document
        """
        async with self.session.get(self.url, params=params) as response:
            response.raise_for_status()
            resp_text = await response.text()
            soup = BeautifulSoup(resp_text, 'html.parser')
            return soup

    async def get_page_articles(self, page: int) -> List[Article]:
        """Return all the articles on one page as a list of Article instances.

        Args:
            page (int): the page number

        Returns:
            List[Article]: list of Article
        """
        doc = await self.fetch_url(params={'p': page})
        articles = [Article.from_page_items(article) for article in doc.find_all(
            'article', {'class': 'entry card post-list'})]
        await asyncio.sleep(1)
        return articles

    async def gather_articles(self) -> List[List[Article]]:
        """Gather all pages up to the end of the pagination.

        Returns:
            List[List[Article]]
        """
        doc = await self.fetch_url()
        # The href of the last pagination link ends with the final page number.
        end_page_number = int(doc.select_one(
            'ul.pagination li:last-child').find('a')['href'].split('=')[-1])
        coros = [self.get_page_articles(page)
                 for page in range(1, end_page_number + 1)]
        return await asyncio.gather(*coros)

    async def get_all_articles(self) -> List[Article]:
        """Gather all articles and flatten them into a List[Article].

        Returns:
            List[Article]
        """
        result = await self.gather_articles()
        return reduce(iconcat, result, [])


async def main():
    async with Scrape(BASE_URL) as scrape:
        result = await scrape.get_all_articles()
        print(result)

asyncio.run(main())
After that, using this code, I will store all of the info in a database and play with it using the pandas library.
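For that follow-up step, this is a minimal sketch of what the database/pandas part could look like, assuming the List[Article] returned by get_all_articles(); the store_articles helper, the articles table, and the articles.db file are placeholder names of mine, not anything from the code above:

import sqlite3
from contextlib import closing
from dataclasses import asdict
from typing import List

import pandas as pd


def store_articles(articles: List[Article], db_path: str = 'articles.db') -> pd.DataFrame:
    """Persist scraped articles to SQLite and return them as a DataFrame."""
    # asdict() turns each Article dataclass into a plain dict of its fields.
    df = pd.DataFrame([asdict(article) for article in articles])
    # Table and file names are placeholders; if_exists='replace' overwrites
    # the table on each run, which suits a full re-scrape.
    with closing(sqlite3.connect(db_path)) as conn:
        df.to_sql('articles', conn, if_exists='replace', index=False)
    return df

Inside main() this would be called as df = store_articles(result), after which the usual pandas tooling (groupby on author or tags, to_datetime on upload_on, and so on) applies directly.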