EADST

Python: Obtain Baidu Images Using Web Crawler

Python: Obtain Baidu Images Using Web Crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text. Its first line is the number of images
    to save per keyword; every following non-empty line is one keyword.
    Calling the instance creates one timestamped folder per keyword in the
    current working directory and fills it with the downloaded images.
    """

    # Captures the value of each "objURL" field found in the result page.
    _OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)",', re.S)
    # Responses of this many bytes or fewer are not saved.
    _MIN_IMAGE_BYTES = 10000
    # Step applied to the "pn" offset between consecutive result pages
    # (presumably the number of results Baidu returns per page).
    _PAGE_STEP = 60
    _SEARCH_URL = 'https://image.baidu.com/search/flip'

    def __init__(self, path='./name.txt'):
        """Load the per-keyword image count and the keywords from *path*.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the file is empty or its first line is not an
                integer.
        """
        self.num = 1  # 1-based index of the next image to save
        self.class_number = 0
        self.line_list = []
        with open(path, encoding='utf-8') as file:
            lines = [line.strip() for line in file]
        if not lines:
            raise ValueError('keyword file is empty: ' + str(path))
        self.class_number = int(lines[0])
        # Blank lines would otherwise become empty search keywords.
        self.line_list = [line for line in lines[1:] if line]

    @staticmethod
    def _image_path(save_path, keyword, index):
        """Return the file path for image number *index* of *keyword*."""
        return os.path.join(save_path, keyword + '_' + str(index) + '.jpg')

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced in *html* into *save_path*.

        Stops once ``self.class_number`` images have been saved for the
        current keyword. Failed or too-small downloads are skipped.

        Args:
            html: text of one Baidu image-search result page.
            keyword: search keyword, used as the file-name prefix.
            save_path: existing directory that receives the images.
        """
        pic_urls = self._OBJ_URL_PATTERN.findall(html)  # get image urls
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_urls:
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                # Do not store an HTTP error page under a .jpg name.
                pic.raise_for_status()
                content = pic.content
                if len(content) > self._MIN_IMAGE_BYTES:  # img size > 10k
                    with open(self._image_path(save_path, keyword, self.num), 'wb') as fp:
                        fp.write(content)
                    self.num += 1
            except (requests.RequestException, OSError):
                # Network and file-system failures only, so Ctrl-C still
                # interrupts the crawl.
                print('error, cannot download')

    # Correctly spelled alias; the original name is kept for existing callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Search every keyword and download its images into a new folder."""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        with requests.Session() as session:
            session.headers = headers

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.makedirs(save_path, exist_ok=True)
                # get images
                self.num = 1
                for image_number in range(0, self.class_number, self._PAGE_STEP):
                    if self.num > self.class_number:
                        break
                    # Passing params lets requests URL-encode the keyword.
                    query = {
                        'tn': 'baiduimage',
                        'ie': 'utf-8',
                        'word': word,
                        'pn': image_number,
                    }
                    try:
                        result = session.get(self._SEARCH_URL, params=query,
                                             timeout=10, allow_redirects=False)
                        html = result.text
                    except requests.RequestException:
                        print('Internet error')
                        continue
                    self.dowmload_picture(html, word, save_path)

if __name__ == '__main__':
    # Read the image count and keywords from the file, then download.
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one search keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
PyTorch Quantize Michelin Logo tqdm GoogLeNet Datetime hf CSV SQL COCO FP32 Rebuttal git Hilton CV Gemma Firewall 腾讯云 Disk WebCrawler Proxy LaTeX Qwen2 FP64 GIT Miniforge Baidu GGML Windows HuggingFace Review BF16 Plotly Data 签证 Use v2ray Hungarian XML Jupyter FlashAttention 多进程 BTC Domain Bin 云服务器 WAN Knowledge YOLO Pillow HaggingFace Statistics Bipartite VPN Transformers VSCode Excel CC Heatmap SQLite SVR Ptyhon transformers QWEN Conda Hotel Pickle 阿里云 Web mmap Quantization 版权 UNIX Tiktoken Numpy CTC Anaconda v0.dev torchinfo TSV Linux OpenCV 关于博主 VGG-16 LoRA Paddle FastAPI Land uwsgi printf NLTK Card Qwen2.5 飞书 OpenAI Jetson Animate SPIE Llama Augmentation LLAMA Password Safetensors Plate Input 域名 Distillation CUDA PDF Claude UI FP8 icon Color Website llama.cpp Tracking CEIR Paper Bitcoin News ChatGPT git-lfs 顶会 Attention FP16 Google Search OCR 继承 SAM Github Ubuntu Clash 强化学习 Vim Video BeautifulSoup 财报 NLP DeepSeek Translation Bert DeepStream Diagram Magnet Dataset Vmess Git 净利润 音频 Math Pytorch 递归学习法 Pandas LLM API EXCEL RAR LeetCode Crawler Sklearn scipy Algorithm Tensor CAM Image2Text Agent 算法题 Shortcut 多线程 搞笑 logger IndexTTS2 TensorRT Qwen Cloudreve JSON TensorFlow diffusers PyCharm PIP Food ONNX Docker XGBoost 图标 Breakpoint tar Streamlit Nginx InvalidArgumentError ModelScope Markdown Base64 ResNet-50 第一性原理 Zip uWSGI PDB 公式 AI Random Freesound CLAP MD5 Template GPT4 C++ Django 图形思考法 GPTQ Interview Mixtral NameSilo Permission RGB Python 证件照 TTS 报税
站点统计

本站现有博文324篇,共被浏览808747

本站已经建立2511天!

热门文章
文章归档
回到顶部