I'm attempting to build an email scraper that takes in a CSV file of URLs and returns them with email addresses, including any additional URLs/addresses that get scraped in the process. I can't seem to get my spider to iterate through each row in the CSV file, even though the rows come back fine when I test the function I'm calling.
Here's the code, which I adapted from here:
import os, re, csv, scrapy, logging
import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from googlesearch import search
from time import sleep

# Avoid getting too many logs and warnings when using Scrapy inside Jupyter Notebook.
logging.getLogger('scrapy').propagate = False

# Extract urls from file.
def get_urls():
    urls = pd.read_csv('food_urls.csv')
    url = list(urls)
    for i in url:
        return urls

# Test it.
# get_urls()
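For what it's worth, this is roughly how I check the function in the notebook (same food_urls.csv as above), and the rows print fine:

# Quick sanity check of get_urls() in the notebook
result = get_urls()
print(type(result))  # see what kind of object actually comes back
print(result)        # the rows from food_urls.csv all show up here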
# Create mail spider.
class MailSpider(scrapy.Spider):
    name = 'email'

    def parse(self, response):
        # Search for links inside URLs.
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        # Take in a list of URLs as input and read their source codes one by one.
        links = [str(link.url) for link in links]
        links.append(str(response.url))
        # Send links from one parse method to another.
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)

    # Pass URLs to the parse_link method; this is where we apply re.findall to look for emails.
    def parse_link(self, response):
        html_text = str(response.text)
        mail_list = re.findall(r'\w+@\w+\.{1}\w+', html_text)
        dic = {'email': mail_list, 'link': str(response.url)}
        df = pd.DataFrame(dic)
        df.to_csv(self.path, mode='a', header=False)
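In case it's relevant, here's how I understand the regex in parse_link to behave on a plain string (a toy example with made-up addresses, not output from my actual run):

import re

sample = 'Contact us at hello@example.com or admin@example.org.'
print(re.findall(r'\w+@\w+\.{1}\w+', sample))
# prints: ['hello@example.com', 'admin@example.org']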
# Save emails in a CSV file.
def ask_user(question):
    response = input(question + ' y/n' + '\n')
    if response == 'y':
        return True
    else:
        return False

def create_file(path):
    response = False
    if os.path.exists(path):
        response = ask_user('File already exists, replace?')
        if response == False: return

    with open(path, 'wb') as file:
        file.close()
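These two helpers just (re)create the output file before the crawl starts; as I understand it, calling them on their own would look like this (hypothetical stand-alone use):

# Hypothetical stand-alone use of the helpers above
create_file('food_emails.csv')  # prompts before replacing an existing file
print(ask_user('Continue?'))    # True only if the answer typed is exactly "y"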
# Combine everything.
def get_info(root_file, path):
    create_file(path)
    df = pd.DataFrame(columns=['email', 'link'], index=[0])
    df.to_csv(path, mode='w', header=True)

    print('Collecting urls...')
    urls_list = get_urls()

    print('Searching for emails...')
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(MailSpider, start_urls=urls_list, path=path)
    process.start()

    print('Cleaning emails...')
    df = pd.read_csv(path, index_col=0)
    df.columns = ['email', 'link']
    df = df.drop_duplicates(subset='email')
    df = df.reset_index(drop=True)
    df.to_csv(path, mode='w', header=True)

    return df
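My understanding from the Scrapy docs is that start_urls should end up as a plain list of URL strings by the time the spider runs, something along these lines (placeholder URLs, reusing the MailSpider and imports defined above):

# Hypothetical, hard-coded version of the crawl call for comparison
example_urls = [
    'https://example.com/',
    'https://example.org/contact/',
]
process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(MailSpider, start_urls=example_urls, path='food_emails.csv')
process.start()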
At the end, when I call df = get_info('food_urls.csv', 'food_emails.csv'), the scraper takes quite a while to run.
When it finished, I ran df.head() and got this:
               email                                        link
0                NaN                                         NaN
1  [email protected]   https://therecipecritic.com/food-blogger/
2  [email protected]          https://therecipecritic.com/terms/
So it's working, but it's only crawling the first URL in the list.
Does anyone know what I'm doing wrong?
Thanks!