Multithreading

Use multithreading to speed up the crawling process by fetching and parsing multiple URLs concurrently.

Extension Code


from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import csv
import threading

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

# Lock for thread-safe operations
lock = threading.Lock()

# Open a CSV file to store results
with open('crawled_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Title'])

    def is_same_domain(url, base_domain):
        parsed_extracted_url = urlparse(url)
        return parsed_extracted_url.netloc == base_domain

    def crawl(url, depth):
        if depth > depth_limit:
            return

        # Check and mark the URL under one lock acquisition so that two
        # threads cannot both see it as unvisited and crawl it twice
        with lock:
            if url in visited_urls:
                return
            visited_urls.add(url)
        
        try:
            response = requests.get(url, timeout=10)  # timeout keeps a stalled request from blocking its thread indefinitely
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract and process content from the URL
            title = soup.title.string if soup.title else 'No title'
            print(f"Crawling URL: {url}")
            print(f"Title: {title}")
            
            with lock:
                # Write the URL and title to the CSV file
                writer.writerow([url, title])
            
            # Extract URLs from the page
            links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
            
            # Filter and crawl further
            threads = []
            for link in links:
                if is_same_domain(link, base_domain):
                    thread = threading.Thread(target=crawl, args=(link, depth + 1))
                    threads.append(thread)
                    thread.start()
            
            for thread in threads:
                thread.join()
                    
        except requests.RequestException as e:
            print(f"Failed to crawl {url}: {e}")

    # Start crawling
    crawl(url_to_scrape, 0)

Explanation

Function:

  • threading.Lock(): Ensures thread-safe operations when threads access shared resources such as visited_urls and the CSV writer (see the sketch after this list).

  • threading.Thread(): Creates a new thread for each same-domain URL to be crawled.
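
The locking pattern is easiest to see in isolation. The sketch below is a minimal, self-contained illustration (mark_if_new and the example URL are placeholder names, not part of the extension): the membership check and the add happen under the same lock acquisition, so a URL can never be claimed by two threads at once.

import threading

visited_urls = set()
lock = threading.Lock()

def mark_if_new(url):
    # Check and add under one lock acquisition; done separately, two threads
    # could both see the URL as unvisited and both proceed to crawl it
    with lock:
        if url in visited_urls:
            return False
        visited_urls.add(url)
        return True

# Two threads race to claim the same URL; only one succeeds
threads = [threading.Thread(target=mark_if_new, args=("https://example.com/a",))
           for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(visited_urls)  # {'https://example.com/a'}: recorded exactly once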

Explanation:

  1. Thread-Safe Operations: A lock ensures that only one thread at a time touches shared state: the visited check-and-add and every write to the CSV file happen inside with lock: blocks.

  2. Multithreading: A new thread is started for each same-domain link found on a page, so multiple URLs are fetched and parsed concurrently; each crawl call joins its child threads before returning. A bounded alternative is sketched below.
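
Note that the extension above starts one thread per discovered link, which on link-heavy pages can spawn a very large number of threads. As a rough alternative sketch (not part of the extension; the function name crawl_with_pool and the max_workers value are illustrative assumptions), a fixed-size pool from the standard-library concurrent.futures module bounds concurrency while keeping the same depth-limited, same-domain behavior:

from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

def crawl_with_pool(start_url, depth_limit=2, max_workers=8):
    base_domain = urlparse(start_url).netloc
    visited = {start_url}
    results = []

    def fetch(url):
        # Fetch one page and return its title plus every link it contains
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to crawl {url}: {e}")
            return None, []
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string if soup.title else 'No title'
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
        return title, links

    frontier = [start_url]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for _ in range(depth_limit + 1):
            if not frontier:
                break
            # pool.map() fans fetch() out across at most max_workers threads
            next_frontier = []
            for url, (title, links) in zip(frontier, pool.map(fetch, frontier)):
                if title is not None:
                    results.append((url, title))
                for link in links:
                    if urlparse(link).netloc == base_domain and link not in visited:
                        visited.add(link)
                        next_frontier.append(link)
            frontier = next_frontier
    return results

Because links are deduplicated in the main thread between levels, this variant needs no lock at all; visited and results are only ever touched by the main thread.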
