Use multithreading to speed up the crawling process by handling multiple URLs simultaneously.
Extension Code
from hypercrawlturbo import scraper
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import csv
import threading

# Define the URL of the webpage to scrape
url_to_scrape = "https://hyperllm.gitbook.io/hyperllm"

# Parse the domain of the URL to scrape
parsed_url = urlparse(url_to_scrape)
base_domain = parsed_url.netloc

# Set the depth limit for crawling
depth_limit = 2

# Store the visited URLs to avoid revisiting
visited_urls = set()

# Lock for thread-safe operations
lock = threading.Lock()

def is_same_domain(url, base_domain):
    # Only follow links that stay on the starting site's domain
    parsed_extracted_url = urlparse(url)
    return parsed_extracted_url.netloc == base_domain

def crawl(url, depth):
    if depth > depth_limit:
        return
    with lock:
        # Check and mark the URL under the lock so two threads cannot claim it at once
        if url in visited_urls:
            return
        visited_urls.add(url)
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract and process content from the URL
        title = soup.title.string if soup.title else 'No title'
        print(f"Crawling URL: {url}")
        print(f"Title: {title}")

        with lock:
            # Write the URL and title to the CSV file
            writer.writerow([url, title])

        # Extract URLs from the page
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        # Filter and crawl further, one thread per same-domain link
        threads = []
        for link in links:
            if is_same_domain(link, base_domain):
                thread = threading.Thread(target=crawl, args=(link, depth + 1))
                threads.append(thread)
                thread.start()
        for thread in threads:
            thread.join()
    except requests.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

# Open a CSV file to store results and start crawling while the file is still open,
# so the shared writer remains valid for every thread
with open('crawled_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Title'])

    # Start crawling
    crawl(url_to_scrape, 0)
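When run, the script prints each crawled URL and its title to the console and writes the same pairs to crawled_data.csv. Because crawl() spawns a thread for every same-domain link and joins them before returning, pages with many links can create a large number of threads; raising or lowering depth_limit controls how far the crawl fans out.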
Explanation
Functions:
threading.Lock(): Ensures thread-safe operations when accessing shared resources such as visited_urls and the CSV writer (a standalone sketch of this locking pattern follows this list).
threading.Thread(): Creates a new thread for each URL to be crawled.
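To illustrate the locking pattern in isolation, here is a minimal sketch separate from the extension code; the demo.csv filename and the fake_urls list are assumptions used only for demonstration. Several threads record URLs into a shared set and a shared CSV writer under a single threading.Lock:

import csv
import threading

visited = set()           # shared state that must be guarded
lock = threading.Lock()   # one lock protects both the set and the CSV writer

def record(url, writer):
    # Only one thread at a time may touch the shared set or the CSV writer
    with lock:
        if url in visited:
            return
        visited.add(url)
        writer.writerow([url])

# Hypothetical URLs, purely for demonstration
fake_urls = ["https://example.com/a", "https://example.com/b", "https://example.com/a"]

with open('demo.csv', mode='w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['URL'])
    threads = [threading.Thread(target=record, args=(u, writer)) for u in fake_urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for all writer threads to finish before the file closes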
Explanation:
Thread-Safe Operations: Use a lock to ensure only one thread accesses shared resources like visited_urls and the CSV file at a time.
Multithreading: Create a new thread for each URL to be crawled, allowing multiple URLs to be processed concurrently; a minimal thread-per-URL sketch is shown below.
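The following sketch shows the thread-per-URL pattern on its own, independent of the extension code; the urls list is a placeholder. Each thread fetches one page and records its HTTP status:

import threading
import requests

def fetch(url, results):
    # Each thread handles exactly one URL; every thread writes to a distinct key
    try:
        results[url] = requests.get(url, timeout=10).status_code
    except requests.RequestException as e:
        results[url] = str(e)

# Placeholder URLs for demonstration only
urls = ["https://example.com", "https://example.org"]
results = {}

threads = [threading.Thread(target=fetch, args=(u, results)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for every fetch to complete

print(results)

For link-heavy sites, a bounded pool such as concurrent.futures.ThreadPoolExecutor can cap the number of simultaneous threads while keeping the same per-URL division of work.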