EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download Baidu image-search results for a list of keywords using a simple Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text. Its first non-blank line is the number of
    images to save per keyword; every following non-blank line is one search
    keyword.

    Calling the instance creates one timestamped folder per keyword in the
    current working directory and saves images there as ``<keyword>_<n>.jpg``.
    """

    # Pattern that pulls the image URLs out of the result page source.
    _OBJ_URL_RE = re.compile('"objURL":"(.*?)",', re.S)
    # Responses of this many bytes or fewer are not saved.
    MIN_IMAGE_BYTES = 10000
    # Step applied to the ``pn`` offset between requests
    # (presumably the number of results per page -- TODO confirm).
    PAGE_SIZE = 60

    def __init__(self, path='./name.txt'):
        """Read the per-keyword image count and the keywords from *path*.

        Args:
            path: UTF-8 text file; first non-blank line is the image count,
                remaining non-blank lines are keywords.

        Raises:
            OSError: if *path* cannot be opened.
            ValueError: if the file has no content or its first non-blank
                line is not an integer.
        """
        self.num = 1  # 1-based index of the next image to save
        self.class_number = 0  # images wanted per keyword
        self.line_list = []  # search keywords
        with open(path, encoding='utf-8') as file:
            stripped = [line.strip() for line in file]
        # Drop blank lines so they are not treated as empty keywords.
        lines = [line for line in stripped if line]
        if not lines:
            raise ValueError('keyword file is empty: ' + str(path))
        self.class_number = int(lines[0])
        self.line_list = lines[1:]

    def download_picture(self, html, keyword, save_path):
        """Save the images referenced in *html* until the quota is reached.

        Args:
            html: source text of one search-result page.
            keyword: search keyword, used in log lines and file names.
            save_path: existing directory the images are written into.

        ``self.num`` is advanced once per saved image. Failed or too-small
        downloads are skipped without raising.
        """
        pic_url = self._OBJ_URL_RE.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota before downloading so a later page cannot
            # overshoot the requested count by one image.
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                # Skip error responses and bodies that are too small.
                if not pic.ok or len(pic.content) <= self.MIN_IMAGE_BYTES:
                    continue
                # os.path.join keeps the file inside save_path on every OS.
                file_name = os.path.join(
                    save_path, keyword + '_' + str(self.num) + '.jpg')
                with open(file_name, 'wb') as fp:
                    fp.write(pic.content)
                self.num += 1
            except (requests.RequestException, OSError, ValueError):
                # Best effort: one bad URL must not stop the whole crawl,
                # but Ctrl-C / SystemExit still propagate.
                print('error, cannot download')

    # Backward-compatible alias for the original (misspelled) method name.
    dowmload_picture = download_picture

    def __call__(self):
        """Crawl every keyword and store its images in a new folder.

        For each keyword a folder named ``<keyword>_file_<YYYYmmdd_HHMMSS>``
        is created, then result pages are requested until the page offset
        reaches ``self.class_number`` or enough images have been saved.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        url = 'https://image.baidu.com/search/flip'

        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            session.headers = headers

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.makedirs(save_path, exist_ok=True)
                # get images
                image_number = 0
                self.num = 1
                while (image_number < self.class_number
                       and self.num <= self.class_number):
                    # Passing the query as params lets requests URL-encode
                    # the keyword (spaces, '&', non-ASCII text).
                    params = {
                        'tn': 'baiduimage',
                        'ie': 'utf-8',
                        'word': word,
                        'pn': image_number,
                    }
                    try:
                        result = session.get(url, params=params, timeout=10,
                                             allow_redirects=False)
                    except requests.RequestException:
                        print('Internet error')
                    else:
                        self.download_picture(result.text, word, save_path)
                    image_number += self.PAGE_SIZE

# Script entry point: read the keywords from ./keywords.txt and run the crawl.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one search keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
CC BeautifulSoup 图形思考法 强化学习 Bin CAM ModelScope Shortcut HaggingFace Attention Docker News 公式 PIP logger C++ Clash GPT4 Sklearn uWSGI tar 腾讯云 Qwen2.5 SPIE Crawler Rebuttal Firewall Miniforge Anaconda Review Qwen2 Safetensors InvalidArgumentError 音频 版权 Input 递归学习法 Use Logo PDF OpenCV TensorFlow FP16 Bipartite Bitcoin 财报 GPTQ Gemma Proxy Baidu Translation Django DeepStream Windows 搞笑 VPN Claude 飞书 SAM Cloudreve LeetCode mmap OpenAI Ptyhon LLAMA Quantization 图标 llama.cpp Streamlit OCR Conda Land UNIX RGB Google Qwen Pillow Domain icon ONNX XGBoost scipy 算法题 Color NameSilo TSV Ubuntu Vmess COCO TensorRT Jupyter Mixtral Video FastAPI Data NLP Animate torchinfo 论文速读 Base64 CLAP Git WAN Breakpoint JSON Markdown Diagram MD5 Nginx XML 签证 VGG-16 FP8 FP64 uwsgi 阿里云 SQL ResNet-50 域名 Tensor 顶会 Numpy IndexTTS2 继承 Pickle Magnet SQLite Web YOLO Augmentation Excel API GoogLeNet PyCharm Plate PDB Random Tracking v0.dev Tiktoken Pytorch Math CTC WebCrawler 云服务器 v2ray Image2Text Vim BTC TTS Zip 证件照 PyTorch 报税 多线程 Algorithm Template Freesound VSCode Website ChatGPT NLTK Linux Password BF16 Hilton DeepSeek Llama QWEN AI 关于博主 Quantize Dataset Disk Distillation Datetime HuggingFace 第一性原理 diffusers FlashAttention transformers Michelin Agent LoRA Plotly Python Transformers 多进程 Bert tqdm Heatmap CSV git 净利润 Paddle printf CEIR GGML hf CV Github UI Statistics Food Paper Interview GIT EXCEL Hungarian LaTeX CUDA git-lfs Hotel SVR RAR Jetson Permission FP32 论文 Card Pandas Knowledge Search LLM
站点统计

本站现有博文327篇,共被浏览825891

本站已经建立2532天!

热门文章
文章归档
回到顶部