
I use asyncio to speed up web scraping. From the list view of a specific website I collect only the title, author, tags, datetime, and total comment count, and I collect these from every page. I would like to improve my code, so I would appreciate any ideas.

My code:

from bs4 import BeautifulSoup, Tag
from dataclasses import dataclass
from typing import List
import aiohttp
import asyncio
from functools import reduce
from operator import iconcat

@dataclass
class Article:
    title: str
    author: str
    tags: str
    upload_on: str
    comments: int
    link: str

    @classmethod
    def from_page_items(cls, item: Tag) -> 'Article':
        spans = item.find('div', {'class': 'entry__header'}).find_all('span')
        entry_title = item.find('h2', {'class': 'entry__title'})
        anchor = item.find('div', {'class': 'entry__header'}).find_all('a')

        return cls(
            title=entry_title.text.strip(),
            author=anchor[1].text,
            tags=anchor[2].text,
            upload_on=spans[0].text,
            comments=int(spans[1].text) if len(spans) > 1 else 0,
            link=entry_title.find('a').get('href')
        )


class Scrape:

    def __init__(self, url: str) -> None:
        self.session = None
        self.url = url

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, *args):
        await self.session.close()

    async def fetch_url(self, params: dict = {}) -> BeautifulSoup:
        """Fetch a url and return HTML document

        Args:
            params (dict, optional): [description]. Defaults to {}.

        Returns:
            BeautifulSoup: HTML document
        """
        async with self.session.get(self.url, params=params) as response:
            response.raise_for_status()
            resp_text = await response.text()
            soup = BeautifulSoup(resp_text, 'html.parser')
        return soup

    async def get_page_articles(self, page: int) -> List[Article]:
        """For each page return all the articles in a list contain details with Article Class

        Args:
            page (int): the number of page

        Returns:
            List[Article]: List of Article
        """
        doc = await self.fetch_url(params={'p': page})
        articles = [Article.from_page_items(article) for article in doc.find_all(
            'article', {'class': 'entry card post-list'})]
        # Brief pause so the server is not hammered with requests
        await asyncio.sleep(1)
        return articles

    async def gather_articles(self) -> List[List[Article]]:
        """Gather all pages until the end of pagination `end_page`

        Returns:
            List[List[Article]]
        """
        doc = await self.fetch_url()
        end_page_number = int(doc.select_one(
            'ul.pagination li:last-child').find('a')['href'].split('=')[-1])
        coros = [self.get_page_articles(page) for page in range(1, end_page_number + 1)]
        return await asyncio.gather(*coros)

    async def get_all_articles(self) -> List[Article]:
        """Gather all articles and transform to List[Article]
        Returns:
            List[Article]
        """
        result = await self.gather_articles()
        # Flatten List[List[Article]] into a single List[Article]
        return reduce(iconcat, result, [])



BASE_URL = 'https://example.com'  # placeholder: set this to the site being scraped

async def main():
    async with Scrape(BASE_URL) as scrape:
        result = await scrape.get_all_articles()
        print(result)

asyncio.run(main())

After that, using this code, I will store all the info in a database and explore it with the pandas library.
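For that last step, here is a minimal sketch of one way to load the scraped articles into pandas and persist them to SQLite. The store_articles helper, the articles.db path, and the articles table name are hypothetical and not part of the code above; dataclasses.asdict turns each Article into a plain dict, so the dataclass fields become the column names.

import sqlite3
from dataclasses import asdict
from typing import List

import pandas as pd


def store_articles(articles: List[Article], db_path: str = 'articles.db') -> pd.DataFrame:
    """Persist scraped articles to SQLite and return them as a DataFrame."""
    # Each Article becomes one row; the dataclass fields name the columns.
    df = pd.DataFrame([asdict(article) for article in articles])
    conn = sqlite3.connect(db_path)
    try:
        # 'append' accumulates rows across repeated runs;
        # use if_exists='replace' to start fresh instead.
        df.to_sql('articles', conn, if_exists='append', index=False)
    finally:
        conn.close()
    return df

With something like this, main() could call store_articles(result) instead of printing the list.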
