EADST

Python: Obtain Baidu Images Using Web Crawler

Python: Obtain Baidu Images Using Web Crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text. Its first line is the number of images
    wanted per keyword; every following non-blank line is one keyword.
    Calling the instance creates one timestamped folder per keyword in the
    current working directory and saves the images there as
    ``<keyword>_<n>.jpg``.
    """

    def __init__(self, path='./name.txt'):
        """Read the per-keyword image count and the keywords from *path*.

        Args:
            path: Path of the keyword file.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If the file is empty or its first line is not an
                integer.
        """
        self.num = 1  # 1-based index of the next image to save
        with open(path, encoding='utf-8') as file:
            lines = [line.strip() for line in file]
        if not lines:
            raise ValueError('keyword file is empty: ' + str(path))
        self.class_number = int(lines[0])
        # Blank lines would otherwise be treated as empty search keywords.
        self.line_list = [word for word in lines[1:] if word]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced by one search-result page.

        Image URLs are taken from the ``"objURL"`` fields found in *html*.
        Responses of 10000 bytes or fewer are skipped. ``self.num`` is
        incremented after every saved image, and the method stops as soon
        as ``self.num`` exceeds ``self.class_number``.

        Args:
            html: Text of the search-result page.
            keyword: Keyword used as the file-name prefix.
            save_path: Existing directory the images are written to.
        """
        pic_url = re.findall('"objURL":"(.*?)",', html, re.S)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                data = pic.content
                if len(data) > 10000:  # img size > 10k
                    # os.path.join uses the platform separator, so the file
                    # lands inside save_path on every operating system.
                    target = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
                    with open(target, 'wb') as fp:
                        fp.write(data)
                    self.num += 1
            except Exception:
                # Best effort per image: skip this one and keep going.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                print('error, cannot download')

    def __call__(self):
        """Create a folder per keyword and download its images.

        For each keyword, result pages are requested at offsets 0, 60,
        120, ... while the offset is below ``self.class_number``.

        Raises:
            OSError: If a keyword's output folder cannot be created.
        """
        from urllib.parse import quote

        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            session.headers = headers

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.mkdir(save_path)
                # get images; the keyword is percent-encoded so characters
                # such as '&' or '#' cannot corrupt the query string
                url = ('https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
                       + quote(word) + '&pn=')
                image_number = 0
                self.num = 1
                while image_number < self.class_number:
                    try:
                        result = session.get(url + str(image_number), timeout=10, allow_redirects=False)
                        self.dowmload_picture(result.text, word, save_path)
                    except Exception:
                        # Best effort per page; Ctrl-C still propagates.
                        print('Internet error')
                    image_number += 60

# Script entry point: read the keywords from ./keywords.txt and download
# the images for each of them.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each of the following lines is one search keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Firewall Crawler FP16 Logo Statistics GPT4 Django VSCode Augmentation 阿里云 NameSilo XGBoost transformers Proxy Card 递归学习法 飞书 Plotly Clash PDB RAR Jetson Freesound WAN Claude 多线程 Review Datetime CUDA ChatGPT Data Dataset CEIR Nginx LaTeX Excel uWSGI CAM UI 公式 HuggingFace 搞笑 Docker Paper Attention scipy Linux Base64 BeautifulSoup Qwen2 音频 Quantize YOLO GPTQ uwsgi Password OpenAI FP64 域名 CV Baidu UNIX Magnet Pillow Domain Random tqdm Animate 第一性原理 hf Safetensors TensorRT Michelin GoogLeNet Interview Qwen2.5 Input printf 证件照 API diffusers Use Numpy SPIE Vmess SAM VPN DeepSeek torchinfo Knowledge Food PIP Zip Pickle Distillation Math SQL PyTorch tar Hungarian Sklearn PDF Tiktoken Gemma XML Qwen GIT Web PyCharm Algorithm CTC FP8 SVR Quantization Search Disk RGB BF16 C++ Streamlit DeepStream OCR COCO Pandas Git Python 顶会 NLTK Video Pytorch CLAP Image2Text Permission WebCrawler Tensor 继承 Agent 云服务器 Jupyter 多进程 ModelScope LLM ResNet-50 Llama TSV TensorFlow BTC Transformers Hotel v2ray CC Vim Mixtral TTS FastAPI Paddle FlashAttention 关于博主 Ptyhon NLP logger 财报 v0.dev Github OpenCV Plate Bipartite 图形思考法 Miniforge Cloudreve Land Website JSON 算法题 Diagram Color GGML 强化学习 Tracking EXCEL QWEN Bitcoin git-lfs News MD5 Bert 签证 Breakpoint git Markdown VGG-16 Anaconda Google LeetCode Heatmap Template Ubuntu mmap Hilton 净利润 ONNX HaggingFace 报税 Translation 版权 LLAMA Conda FP32 LoRA InvalidArgumentError Windows IndexTTS2 AI Shortcut 腾讯云 SQLite Bin llama.cpp CSV
站点统计

本站现有博文321篇,共被浏览764583

本站已经建立2442天!

热门文章
文章归档
回到顶部