EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download images from Baidu image search with a Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect:
    """Download Baidu image-search results for a list of keywords.

    The keyword file holds the number of images wanted per keyword on its
    first non-blank line, followed by one keyword per line.  Calling the
    instance creates one timestamped folder per keyword in the current
    working directory and saves the downloaded images into it.

    Attributes:
        num: 1-based index of the next image to save for the current keyword.
        class_number: Number of images wanted per keyword.
        line_list: Keywords to search for.
    """

    # Pattern the result page is expected to use for original image addresses.
    _OBJ_URL_PATTERN = re.compile('"objURL":"(.*?)",', re.S)
    _SEARCH_URL = 'https://image.baidu.com/search/flip'
    # Smaller responses are skipped (the original comment reads "img size > 10k").
    _MIN_IMAGE_BYTES = 10000
    # Offset added to the ``pn`` query parameter between result pages.
    # NOTE(review): presumably the page size of the flip endpoint - confirm.
    _PAGE_STEP = 60

    def __init__(self, path='./name.txt'):
        """Read the image quota and the keywords from *path*.

        Args:
            path: Text file whose first non-blank line is the number of
                images per keyword and whose remaining lines are keywords.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If the file has no content or its first non-blank
                line is not an integer.
        """
        self.num = 1
        # 'utf-8-sig' also reads plain UTF-8 and drops a leading BOM, which
        # would otherwise make int() fail on the first line.
        with open(path, encoding='utf-8-sig') as file:
            stripped = [line.strip() for line in file]
        # Blank lines would otherwise turn into empty search keywords.
        lines = [line for line in stripped if line]
        if not lines:
            raise ValueError('keyword file is empty: %s' % (path,))
        self.class_number = int(lines[0])
        self.line_list = lines[1:]

    def _save_image(self, url, target, session=None):
        """Fetch *url* and write it to *target*; return True if a file was saved."""
        http = requests if session is None else session
        response = http.get(url, timeout=7)
        # Do not store an HTTP error page under a .jpg name.
        response.raise_for_status()
        if len(response.content) <= self._MIN_IMAGE_BYTES:
            return False
        with open(target, 'wb') as fp:
            fp.write(response.content)
        return True

    def dowmload_picture(self, html, keyword, save_path, session=None):
        """Download the images referenced in one result page.

        Stops as soon as ``self.num`` exceeds ``self.class_number``.  Each
        saved image is named ``<keyword>_<num>.jpg`` and increments
        ``self.num``.  Failures for individual images are reported and
        skipped.

        Args:
            html: Text of a search result page.
            keyword: Search keyword, used in log output and file names.
            save_path: Existing directory that receives the images.
            session: Optional ``requests.Session`` used for the image
                requests; the plain ``requests`` module is used when omitted.
        """
        pic_url = self._OBJ_URL_PATTERN.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota before fetching so that a later page can never
            # save more images than requested.
            if self.num > self.class_number:
                break
            print('='*60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            # os.path.join picks the right separator on every platform.
            target = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
            try:
                if each and self._save_image(each, target, session):
                    self.num += 1
            except Exception:
                # Best effort: one bad image must not stop the crawl.
                # KeyboardInterrupt/SystemExit still propagate, so the run
                # can be aborted by the user.
                print('error, cannot download')

    # Correctly spelled alias; the original name is kept for existing callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Crawl every keyword and save up to ``class_number`` images each."""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        # The context manager closes the session when the crawl ends.
        with requests.Session() as session:
            session.headers = headers

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.makedirs(save_path, exist_ok=True)
                # get images
                image_number = 0
                self.num = 1
                # Stop paging once the quota is met instead of requesting
                # further result pages for nothing.
                while image_number < self.class_number and self.num <= self.class_number:
                    # Passing the keyword via params lets requests escape
                    # characters such as '&', '#' or spaces.
                    params = {
                        'tn': 'baiduimage',
                        'ie': 'utf-8',
                        'word': word,
                        'pn': image_number,
                    }
                    try:
                        result = session.get(self._SEARCH_URL, params=params,
                                             timeout=10, allow_redirects=False)
                        self.dowmload_picture(result.text, word, save_path, session)
                    except Exception:
                        # Best effort per page; Ctrl-C is not swallowed.
                        print('Internet error')
                    image_number += self._PAGE_STEP

# Script entry point: crawl the keywords listed in ./keywords.txt.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Ptyhon PIP Random uwsgi HaggingFace 强化学习 Website v0.dev Qwen BeautifulSoup 域名 Cloudreve v2ray NameSilo WebCrawler Bert LaTeX Pillow CEIR Baidu IndexTTS2 Heatmap tar SAM Rebuttal Qwen2 ONNX Crawler 阿里云 FP8 BTC NLTK Dataset FP64 签证 CTC SQLite Statistics Base64 云服务器 SPIE Plate GGML LeetCode Food Firewall Google FP32 transformers CV UNIX 音频 Web Quantization Jetson YOLO Vmess CAM Transformers 论文速读 Bipartite GIT VGG-16 XML Gemma 版权 QWEN Distillation tqdm Translation UI Logo Augmentation Tiktoken 图形思考法 FastAPI Permission Excel WAN Streamlit BF16 llama.cpp News GoogLeNet 多线程 Linux Docker RL JSON LLM icon 公式 Agent GPTQ DeepSeek Proxy PyTorch torchinfo InvalidArgumentError 净利润 XGBoost COCO HuggingFace Breakpoint C++ Math SVR Diagram OpenAI 飞书 CSV Mixtral Knowledge Use hf TensorFlow Claude uWSGI ChatGPT Password Bin Search Freesound CLAP FlashAttention git LLAMA 第一性原理 Ubuntu Video Hotel Vim Hungarian Magnet 关于博主 算法题 diffusers Interview Anaconda Conda PyCharm Paper Sklearn Template Safetensors Attention Pandas Django RGB MD5 Nginx ResNet-50 Quantize 腾讯云 SQL TTS Disk CUDA Miniforge ModelScope Windows TSV 证件照 NLP Markdown Image2Text Qwen2.5 Input CC Llama OCR Pytorch VPN Python Numpy FP16 Git 论文 AI Bitcoin PDB 报税 Tracking Pickle RAR PDF Tensor Animate Michelin git-lfs Github 搞笑 Zip 继承 GPT4 scipy Land 财报 Datetime API LoRA 多进程 OpenCV Plotly Algorithm TensorRT Clash 顶会 Domain Jupyter EXCEL ms-swift Data Shortcut Paddle Hilton VSCode logger Review printf 图标 Card mmap 递归学习法 DeepStream Color
站点统计

本站现有博文332篇,共被浏览868269

本站已经建立2576天!

热门文章
文章归档
回到顶部