EADST

Python: Obtain Baidu Images Using Web Crawler

Python: Obtain Baidu Images Using Web Crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text. Its first line is the number of images
    to save per keyword; every following non-blank line is one keyword.
    Calling the instance creates one timestamped folder per keyword in the
    current working directory and fills it with the downloaded images.

    Relies on the module-level ``os``, ``re``, ``time`` and ``requests``
    imports of this file.
    """

    def __init__(self, path='./name.txt'):
        """Read the per-keyword image quota and the keyword list from *path*.

        Args:
            path: Path of the keyword file described in the class docstring.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If the file is empty.
            ValueError: If the first line is not an integer.
        """
        self.num = 1            # 1-based index of the next image to save
        self.class_number = 0   # number of images wanted per keyword
        self.line_list = []     # keywords to search for
        with open(path, encoding='utf-8') as file:
            lines = [line.strip() for line in file]
        self.class_number = int(lines[0])
        # Skip blank lines so a stray empty line is not searched as a keyword.
        self.line_list = [word for word in lines[1:] if word]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced by one search-result page.

        The (misspelled) method name is kept for backward compatibility.

        Args:
            html: Text of a result page; image URLs are taken from its
                ``"objURL":"..."`` entries.
            keyword: Search keyword, used as the file-name prefix.
            save_path: Existing directory that receives the images.

        Images are written as ``<keyword>_<n>.jpg``. Responses of 10000
        bytes or fewer are skipped. ``self.num`` is advanced once per saved
        image, and the loop stops as soon as ``self.num`` exceeds
        ``self.class_number``.
        """
        pic_url = re.findall(r'"objURL":"(.*?)",', html, re.S)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota first so an already-satisfied keyword never
            # triggers one more download.
            if self.num > self.class_number:
                break
            print('='*60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                if len(pic.content) > 10000:  # img size > 10k
                    # os.path.join keeps the file inside save_path on every
                    # platform (a hard-coded backslash only works on Windows).
                    string = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
                    with open(string, 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except Exception:
                # Best effort per image; Exception (not BaseException) lets
                # Ctrl-C / SystemExit still stop the crawl.
                print('error, cannot download')

    def __call__(self):
        """Crawl every keyword and download up to ``class_number`` images each.

        For each keyword a folder named ``<keyword>_file_<YYYYmmdd_HHMMSS>``
        is created, then result pages are requested with the ``pn`` offset
        advancing in steps of 60 until the offset reaches ``class_number``
        or the quota is met.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        session = requests.Session()
        session.headers = headers

        for word in self.line_list:
            # create a folder
            time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
            save_path = word + '_file' + "_" + time_now
            # exist_ok avoids a crash when the folder already exists.
            os.makedirs(save_path, exist_ok=True)
            # get images
            url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
            self.num = 1
            for image_number in range(0, self.class_number, 60):
                if self.num > self.class_number:
                    break
                try:
                    result = session.get(url + str(image_number), timeout=10, allow_redirects=False)
                except requests.RequestException:
                    print('Internet error')
                    continue
                self.dowmload_picture(result.text, word, save_path)
if __name__ == '__main__':
    # Script entry point: read the quota and keywords from ./keywords.txt,
    # then crawl and download the images for every keyword.
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each of the following lines is one keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Dataset GPTQ Qwen2 Zip Hilton Pytorch Distillation Use Jetson Michelin NLTK Base64 Magnet Color YOLO OpenCV VGG-16 Paddle C++ VPN API tqdm Paper diffusers Augmentation 签证 Bert Translation Card Logo Datetime 腾讯云 torchinfo Nginx Random Proxy Hotel Vim Anaconda 多线程 Vmess Pandas RGB Quantization NameSilo Llama Conda Ptyhon mmap MD5 PDF v0.dev Google TTS 证件照 Web BTC llama.cpp RAR 飞书 Qwen TSV BF16 CLAP Hungarian ResNet-50 Excel QWEN 音频 CTC 阿里云 ModelScope FP16 Password WAN Streamlit CUDA FP8 Numpy 版权 Pickle Windows Plotly Land transformers Claude FastAPI Algorithm Food Crawler scipy CC IndexTTS2 Django Firewall EXCEL PyCharm GIT BeautifulSoup Review Mixtral VSCode SQLite Heatmap TensorRT Clash Interview v2ray OCR JSON Pillow Transformers uwsgi Ubuntu LLAMA Input Markdown Permission Statistics FlashAttention ONNX UNIX FP64 Git Python Image2Text Tiktoken PyTorch CAM Shortcut Knowledge SAM Data TensorFlow SPIE Template ChatGPT hf Website Bitcoin Tracking uWSGI UI HaggingFace LeetCode LoRA Gemma CEIR CSV 算法题 GoogLeNet Disk GGML LaTeX AI Docker 关于博主 净利润 报税 Qwen2.5 Baidu Domain XML 视频信息 财报 PDB Diagram Linux Plate Github Freesound InvalidArgumentError Cloudreve Bipartite 搞笑 域名 Video Safetensors Jupyter DeepStream SVR Bin DeepSeek 继承 FP32 printf Animate Quantize Attention tar Sklearn OpenAI Breakpoint SQL Tensor XGBoost COCO Miniforge 公式 git-lfs NLP HuggingFace PIP logger git Math CV LLM WebCrawler GPT4 多进程
站点统计

本站现有博文311篇,共被浏览739761

本站已经建立2376天!

热门文章
文章归档
回到顶部