Python: Obtain Baidu Images Using Web Crawler
作者:XD / 发表: 2022年11月9日 03:27 / 更新: 2022年11月9日 03:28 / 编程笔记 / 阅读量:1493
This note shows how to download images from Baidu image search with a small Python web crawler.
Here is the main code.
# -*- coding: utf-8 -*-
import os
import re
import time
import requests
class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text. Its first line is the number of images
    to save per keyword; every following non-blank line is one keyword.

    Calling the instance creates one timestamped folder per keyword in the
    current directory and saves the downloaded images into it.
    """

    # Endpoint queried for every result page.
    _SEARCH_URL = 'https://image.baidu.com/search/flip'
    # Extracts the image URLs embedded in a result page.
    _OBJ_URL_PATTERN = re.compile('"objURL":"(.*?)",', re.S)
    # Amount the ``pn`` offset is advanced by between result pages.
    _PAGE_STEP = 60
    # Responses of this many bytes or fewer are not saved.
    _MIN_IMAGE_BYTES = 10000

    def __init__(self, path='./name.txt'):
        """Read the per-keyword image count and the keywords from *path*.

        Args:
            path: Location of the UTF-8 keyword file.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If the file is empty or its first line is not an
                integer.
        """
        # 1-based index of the next image to save for the current keyword.
        self.num = 1
        with open(path, encoding='utf-8') as file:
            lines = [line.strip() for line in file]
        if not lines:
            raise ValueError('keyword file is empty: {}'.format(path))
        self.class_number = int(lines[0])
        # Blank lines would otherwise become empty-string keywords.
        self.line_list = [line for line in lines[1:] if line]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced by one result page.

        Args:
            html: Text of a search result page.
            keyword: Search keyword; used as the file-name prefix.
            save_path: Existing directory that receives the images.

        Images are written as ``<keyword>_<n>.jpg``. ``self.num`` is
        advanced once per saved image and saving stops as soon as it
        exceeds ``self.class_number``.
        """
        pic_url = self._OBJ_URL_PATTERN.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota before downloading so that an additional page
            # never saves one image more than requested.
            if self.num > self.class_number:
                break
            if not each:
                continue
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            try:
                pic = requests.get(each, timeout=7)
                # Do not store HTTP error pages as images.
                pic.raise_for_status()
                if len(pic.content) <= self._MIN_IMAGE_BYTES:  # img size > 10k
                    continue
                # os.path.join keeps the file inside save_path on every OS.
                target = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
                with open(target, 'wb') as fp:
                    fp.write(pic.content)
            except (requests.RequestException, OSError):
                # Best effort: skip this image and try the next URL.
                print('error, cannot download')
                continue
            self.num += 1

    # Correctly spelled alias; the original name is kept for existing callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Search every keyword and download its images.

        For each keyword a folder named ``<keyword>_file_<timestamp>`` is
        created, then result pages are requested until the page offset
        reaches ``self.class_number`` or enough images have been saved.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            # update() keeps the session's default headers next to ours.
            session.headers.update(headers)
            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.makedirs(save_path, exist_ok=True)
                # get images
                image_number = 0
                self.num = 1
                while image_number < self.class_number and self.num <= self.class_number:
                    try:
                        # Passing the query as params lets requests escape
                        # keywords containing spaces, '&' or non-ASCII text.
                        result = session.get(
                            self._SEARCH_URL,
                            params={
                                'tn': 'baiduimage',
                                'ie': 'utf-8',
                                'word': word,
                                'pn': image_number,
                            },
                            timeout=10,
                            allow_redirects=False,
                        )
                        self.dowmload_picture(result.text, word, save_path)
                    except requests.RequestException:
                        print('Internet error')
                    image_number += self._PAGE_STEP
# Run the crawler only when this file is executed as a script. The guard
# has to compare the dunder names: a bare ``name`` is undefined and would
# raise NameError as soon as the file is run or imported.
if __name__ == '__main__':
    # Keyword file: first line is the image count, then one keyword per line.
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')
Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. The following lines are the keywords, one per line.
20
Dog
Cat