Scrape Multiple Addresses From Multiple Files In Scrapy
Solution 1:
I see two methods.
First:
Run the spider many times with different parameters. This needs less code.
You can create a batch file with many lines, each adding the arguments manually.
The first argument is the output filename, -o result1.csv,
which Scrapy will create automatically.
The second argument is the input filename with the links, -a filename=process1.csv.
scrapy crawl hotel_info -o result1.csv -a filename=process1.csv
scrapy crawl hotel_info -o result2.csv -a filename=process2.csv
scrapy crawl hotel_info -o result3.csv -a filename=process3.csv
...
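If you don't want to maintain the batch file by hand, here is a minimal sketch (assuming the input files live in lastsecond/hotels/) that runs the same commands for every file:
from os import listdir
from os.path import isfile, join
import subprocess

files = [f for f in listdir('lastsecond/hotels/') if isfile(join('lastsecond/hotels/', f))]

for i, input_file in enumerate(files):
    output_file = 'result{}.csv'.format(i)
    # each crawl runs as a separate process, exactly like the manual commands above
    subprocess.run(['scrapy', 'crawl', 'hotel_info', '-o', output_file, '-a', 'filename=' + input_file])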
The spider only needs to read filename
in __init__:
import scrapy
from os.path import isfile, join
import json

class HotelInfoSpider(scrapy.Spider):

    name = 'hotel_info'

    allowed_domains = ['lastsecond.ir']

    def __init__(self, filename, *args, **kwargs):  # <-- filename
        super().__init__(*args, **kwargs)

        filename = join('lastsecond/hotels/', filename)

        if isfile(filename):
            with open(filename) as f:
                data = json.load(f)
                self.start_urls = data['links']

    def parse(self, response):
        print('url:', response.url)

        yield {'url': response.url, 'other': ...}
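For reference, the spider expects each input file to contain JSON with a links list. A minimal sketch that writes such a file (the URLs are only placeholders):
import json

# placeholder links - replace with real lastsecond.ir hotel URLs
data = {'links': ['https://lastsecond.ir/hotel/1', 'https://lastsecond.ir/hotel/2']}

with open('lastsecond/hotels/process1.csv', 'w') as f:
    json.dump(data, f)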
You can also use a Python script with CrawlerProcess
to run the spider many times.
from scrapy.crawler import CrawlerProcess
from os import listdir
from os.path import isfile, join
import json

# adjust this import to the module where the spider class is defined
from hotel_info import HotelInfoSpider

files = [f for f in listdir('lastsecond/hotels/') if isfile(join('lastsecond/hotels/', f))]

for i, input_file in enumerate(files):
    output_file = 'result{}.csv'.format(i)

    # note: Twisted's reactor cannot be restarted in the same process,
    # so if repeated c.start() calls fail, run each crawl in its own process instead
    c = CrawlerProcess({'FEED_FORMAT': 'csv', 'FEED_URI': output_file})
    c.crawl(HotelInfoSpider, filename=input_file)  # input_file='process1.csv'
    c.start()
Or using scrapy.cmdline.execute()
import scrapy.cmdline
from os import listdir
from os.path import isfile, join
import json

files = [f for f in listdir('lastsecond/hotels/') if isfile(join('lastsecond/hotels/', f))]

for i, input_file in enumerate(files):
    output_file = 'result{}.csv'.format(i)

    scrapy.cmdline.execute(["scrapy", "crawl", "hotel_info", "-o", output_file, "-a", "filename=" + input_file])
Second:
It needs more code because you have to create a pipeline exporter which saves results to different files.
You have to use start_requests()
and Request(..., meta=...)
to create the start requests with extra
data in meta,
which you can use later to save to different files.
In parse()
you have to read this extra
value from meta
and add it to the item.
In the pipeline exporter you have to read extra
from the item
and open a different file for each value.
import scrapy
from os import listdir
from os.path import isfile, join
import json

class HotelInfoSpider(scrapy.Spider):

    name = 'hotel_info'

    allowed_domains = ['lastsecond.ir']

    def start_requests(self):
        # get start urls from the links list of every file
        files = [f for f in listdir('lastsecond/hotels/') if isfile(join('lastsecond/hotels/', f))]

        for i, filename in enumerate(files):
            with open('lastsecond/hotels/' + filename) as f:
                data = json.load(f)
                links = data["links"]

            for url in links:
                yield scrapy.Request(url, meta={'extra': i})

    def parse(self, response):
        print('url:', response.url)

        extra = response.meta['extra']
        print('extra:', extra)

        yield {'url': response.url, 'extra': extra, 'other': ...}
pipelines.py
import csv

class MyExportPipeline(object):

    def process_item(self, item, spider):
        # get extra and use it in the filename
        filename = 'result{}.csv'.format(item['extra'])

        # open file for appending
        with open(filename, 'a') as f:
            writer = csv.writer(f)

            # write only selected elements - skip `extra`
            row = [item['url'], item['other']]
            writer.writerow(row)

        return item
settings.py
ITEM_PIPELINES = {
    'your_project_name.pipelines.MyExportPipeline': 300,
}
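With the pipeline registered, one run of the spider is enough; the pipeline routes every item to result0.csv, result1.csv, ... according to its extra index:
scrapy crawl hotel_info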
Solution 2:
You could manage all the files with a dict:
import json

# `files` is the list of filenames in lastsecond/hotels/, as in Solution 1
d_hotel_info = {}

for file in files:
    with open('lastsecond/hotels/' + file, 'r') as hotel_info:
        hotel = json.load(hotel_info)
    d_hotel_info[file] = hotel
and then, when you want to output, you reference the keys of d_hotel_info.
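For example, a minimal sketch of that output step (the output filenames are an assumption; the links key follows the file format used in Solution 1), writing one CSV of links per key:
import csv

# hypothetical output step: one CSV per input file, named after the dict key
for filename, hotel in d_hotel_info.items():
    with open('links_' + filename, 'w', newline='') as f:
        writer = csv.writer(f)
        for link in hotel['links']:
            writer.writerow([link])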