Hi!
I would like to create a scenario that will scrape an entire site and return all the scraped URLs to me.
I thought of using 0codeKit (1SaaS) with this code:

"import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

visited_links = set()  # Set to store previously visited links

def get_internal_links_from_url(url, domain):
    global visited_links
    if url in visited_links:  # If the link has already been visited, skip it
        return []
    visited_links.add(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all links and filter those that are internal to the domain
    all_links = [a['href'] for a in soup.find_all('a', href=True)]
    internal_links = [urljoin(url, link) for link in all_links
                      if urlparse(link).netloc == domain or not urlparse(link).netloc]
    # Recursively crawl each internal link found
    for link in internal_links:
        if link not in visited_links:
            get_internal_links_from_url(link, domain)
    return list(visited_links)

def crawl_site(url):
    domain = urlparse(url).netloc  # Get the domain of the site
    return get_internal_links_from_url(url, domain)

result = {'data': crawl_site('https://www.***.fr')}"
but I got an error.
Do you know the right way to do this?
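In case it helps to show what I mean, I also put together a rough iterative sketch (untested; crawl_site_iterative and the deque-based queue are just my own naming and assumptions, not anything specific to 0codeKit). I thought it might avoid Python's recursion limit when crawling a large site:

"import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import deque

def crawl_site_iterative(start_url):
    # Iterative breadth-first crawl: a queue instead of recursion,
    # so the recursion limit is never hit on large sites (my assumption)
    domain = urlparse(start_url).netloc
    visited = set()
    queue = deque([start_url])
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue  # Skip pages that fail to load
        soup = BeautifulSoup(response.text, 'html.parser')
        for a in soup.find_all('a', href=True):
            link = urljoin(url, a['href'])
            # Keep only links on the same domain that we haven't seen yet
            if urlparse(link).netloc == domain and link not in visited:
                queue.append(link)
    return list(visited)

result = {'data': crawl_site_iterative('https://www.***.fr')}"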
Thx!