EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download images from Baidu image search with a small Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is read once at construction time. Its first non-blank
    line is the number of images wanted per keyword; every following
    non-blank line is one search keyword. Calling the instance creates one
    time-stamped folder per keyword in the current working directory and
    saves the downloaded images there.
    """

    # Base address of the search page; the query string is passed via `params`.
    SEARCH_URL = 'https://image.baidu.com/search/flip'
    # Step applied to the `pn` offset between successive result pages
    # (presumably the page size of this endpoint - confirm if it changes).
    PAGE_SIZE = 60
    # Responses with at most this many bytes are not saved.
    MIN_IMAGE_BYTES = 10000
    # Extracts the original image addresses embedded in the result page.
    _OBJ_URL_RE = re.compile(r'"objURL":"(.*?)",', re.S)

    def __init__(self, path='./name.txt'):
        """Load the per-keyword image count and the keywords from *path*.

        Args:
            path: UTF-8 text file; first non-blank line is an integer count,
                remaining non-blank lines are keywords.

        Raises:
            IndexError: if the file contains no non-blank line.
            ValueError: if the first non-blank line is not an integer.
        """
        self.num = 1            # 1-based index of the next image to save
        self.class_number = 0   # images wanted per keyword
        self.line_list = []     # search keywords
        with open(path, encoding='utf-8') as file:
            # Drop blank lines so a trailing newline or spacer line does not
            # turn into an empty search keyword.
            lines = [k.strip() for k in file if k.strip()]
        self.class_number = int(lines[0])
        self.line_list = lines[1:]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced by one result page.

        Args:
            html: text of a search result page.
            keyword: search keyword, used in log output and file names.
            save_path: existing directory the images are written to.

        Files are named ``<keyword>_<n>.jpg`` where ``n`` is ``self.num``,
        which is incremented after every saved image. Processing stops as
        soon as ``self.num`` exceeds ``self.class_number``.
        """
        pic_url = self._OBJ_URL_RE.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota before downloading so a page handed in after
            # the quota was reached does not fetch one extra image.
            if self.num > self.class_number:
                break
            print('='*60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                # Do not store an HTTP error page under a .jpg name.
                pic.raise_for_status()
                if len(pic.content) > self.MIN_IMAGE_BYTES:  # img size > 10k
                    # os.path.join picks the right separator on every OS.
                    target = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
                    with open(target, 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except Exception:
                # Best effort: one bad image must not abort the crawl, but
                # KeyboardInterrupt / SystemExit are allowed to propagate.
                print('error, cannot download')

    # Correctly spelled alias; the original name is kept for existing callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Crawl every keyword and download up to ``class_number`` images each.

        For each keyword a folder ``<keyword>_file_<YYYYmmdd_HHMMSS>`` is
        created, then result pages are requested at offsets 0, 60, 120, ...
        below ``class_number`` and handed to ``dowmload_picture``. A failed
        page request is reported and skipped.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            session.headers.update(headers)

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                # exist_ok: the same keyword listed twice within one second
                # would otherwise raise FileExistsError.
                os.makedirs(save_path, exist_ok=True)
                # get images
                self.num = 1
                for image_number in range(0, self.class_number, self.PAGE_SIZE):
                    # Passing the query through `params` lets requests escape
                    # keywords containing spaces, '&', '#' and similar.
                    params = {
                        'tn': 'baiduimage',
                        'ie': 'utf-8',
                        'word': word,
                        'pn': image_number,
                    }
                    try:
                        result = session.get(self.SEARCH_URL, params=params,
                                             timeout=10, allow_redirects=False)
                    except requests.RequestException:
                        print('Internet error')
                        continue
                    self.dowmload_picture(result.text, word, save_path)

# Script entry point: read the keyword file and run the crawler.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one search keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
站点统计

本站现有博文242篇,共被浏览288374

本站已经建立1774天!

热门文章
文章归档
回到顶部