EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download images from Baidu image search with a simple Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download images for a list of keywords from Baidu image search.

    The keyword file holds the number of images wanted per keyword on its
    first line, followed by one search keyword per line. Calling the instance
    creates one timestamped folder per keyword in the current working
    directory and saves the downloaded images there as ``<keyword>_<n>.jpg``.
    """

    # Captures the value of every "objURL" field embedded in a result page.
    _OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)",', re.S)
    # Only response bodies larger than this many bytes are saved.
    _MIN_IMAGE_BYTES = 10000
    # Offset added to the ``pn`` query parameter between successive requests.
    _PAGE_STEP = 60

    def __init__(self, path='./name.txt'):
        """Load the per-keyword image count and the keywords from *path*.

        Args:
            path: Text file whose first line is an integer (images wanted per
                keyword) and whose remaining non-blank lines are keywords.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If the file is empty or its first line is not an
                integer.
        """
        self.num = 1            # 1-based index of the next image to save
        self.class_number = 0   # images wanted per keyword
        self.line_list = []     # search keywords
        # utf-8-sig also accepts plain UTF-8 and strips a leading BOM, which
        # would otherwise make int() fail on the first line.
        with open(path, encoding='utf-8-sig') as file:
            lines = [line.strip() for line in file]
        if not lines:
            raise ValueError('keyword file is empty: ' + str(path))
        self.class_number = int(lines[0])
        # Blank lines would otherwise be searched as empty keywords.
        self.line_list = [line for line in lines[1:] if line]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced by one result page.

        Every ``objURL`` found in *html* is fetched in turn; bodies larger
        than ``_MIN_IMAGE_BYTES`` are written to
        ``<save_path>/<keyword>_<n>.jpg`` and ``self.num`` is advanced.
        Stops as soon as ``self.num`` exceeds ``self.class_number``.

        Args:
            html: Text of a search result page.
            keyword: Search keyword, used in log output and file names.
            save_path: Existing directory that receives the image files.
        """
        pic_url = self._OBJ_URL_PATTERN.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                # Do not store an HTTP error page as if it were an image.
                pic.raise_for_status()
                if len(pic.content) > self._MIN_IMAGE_BYTES:
                    # os.path.join picks the right separator on every OS.
                    target = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
                    with open(target, 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except Exception as err:
                # Deliberately best-effort: one bad URL must not abort the
                # batch. Exception (not BaseException) keeps Ctrl-C working.
                print('error, cannot download')
                print('reason: ' + repr(err))

    # Correctly spelled alias; the original name is kept for existing callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Crawl result pages for every keyword and download their images.

        For each keyword a folder named ``<keyword>_file_<timestamp>`` is
        created, the image counter is reset to 1, and result pages are
        requested with ``pn`` = 0, 60, 120, ... while ``pn`` is below
        ``self.class_number``. Failed page requests are reported and skipped.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        url = 'https://image.baidu.com/search/flip'

        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            session.headers.update(headers)

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.makedirs(save_path, exist_ok=True)
                # get images
                self.num = 1
                for image_number in range(0, self.class_number, self._PAGE_STEP):
                    if self.num > self.class_number:
                        break
                    # Passing the query as params lets requests URL-encode
                    # keywords containing spaces, '&', '#' or non-ASCII text.
                    params = {
                        'tn': 'baiduimage',
                        'ie': 'utf-8',
                        'word': word,
                        'pn': image_number,
                    }
                    try:
                        result = session.get(url, params=params, timeout=10, allow_redirects=False)
                    except requests.RequestException:
                        print('Internet error')
                        continue
                    self.dowmload_picture(result.text, word, save_path)

if __name__ == '__main__':
    # Script entry point: read the image count and keywords from
    # keywords.txt, then crawl and download the images for each keyword.
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one search keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
git BTC Firewall Numpy Sklearn PDF 签证 BeautifulSoup Breakpoint printf Agent Jetson Tensor CEIR Diagram 顶会 Tracking torchinfo 财报 SQLite QWEN Hotel LLM Hilton Qwen LaTeX Llama Cloudreve Review Pandas VPN NLTK ms-swift XML VGG-16 Shortcut CSV GPT4 icon CLAP Streamlit Rebuttal GoogLeNet 公式 AI FlashAttention Vmess Plate TensorFlow scipy uwsgi CC JSON Google YOLO 多线程 FP8 ModelScope Ubuntu WebCrawler Quantize SAM tqdm Permission ONNX 腾讯云 CAM UI Git llama.cpp OpenAI 搞笑 Pillow Card 关于博主 mmap NameSilo LeetCode hf Bert Logo Freesound Quantization uWSGI 递归学习法 WAN Pytorch GIT 音频 Augmentation DeepStream Data C++ Transformers Use Template Web 第一性原理 Math OCR GPTQ Paddle 云服务器 继承 图形思考法 ChatGPT Heatmap Website Random OpenCV Algorithm Markdown transformers Python Github 净利润 Excel FP16 Ptyhon Distillation 飞书 PDB Search 论文 Attention FastAPI NLP Mixtral InvalidArgumentError Disk 多进程 Baidu v2ray Statistics DeepSeek CTC FP64 Clash 图标 git-lfs Tiktoken TTS Plotly 域名 VSCode CUDA 证件照 阿里云 Image2Text tar Michelin SQL Crawler 论文速读 ResNet-50 PyTorch 算法题 HuggingFace UNIX Base64 Datetime Claude Dataset Django Bipartite logger API HaggingFace Color Bitcoin SPIE Paper Hungarian Proxy RAR Gemma EXCEL Qwen2 Animate Password BF16 LoRA Safetensors Knowledge CV SVR IndexTTS2 TensorRT Docker Nginx Land Translation MD5 Qwen2.5 v0.dev Input Vim PIP diffusers XGBoost 报税 版权 FP32 Conda LLAMA TSV 强化学习 RL Video RGB Miniforge Bin Windows Food News Magnet Zip COCO PyCharm GGML Domain Interview Linux Pickle Jupyter Anaconda
站点统计

本站现有博文332篇,共被浏览868343

本站已经建立2576天!

热门文章
文章归档
回到顶部