EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download images from Baidu image search with a Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text. Its first non-blank line is the number
    of images wanted per keyword; every following non-blank line is one
    keyword. Calling the instance creates one timestamped folder per keyword
    in the current working directory and saves the downloaded images there.
    """

    # Number of results the paged search URL advances by per request.
    _PAGE_SIZE = 60
    # Responses of this many bytes or fewer are not saved as images.
    _MIN_IMAGE_BYTES = 10000

    def __init__(self, path='./name.txt'):
        """Read the per-keyword image count and the keywords from *path*.

        Args:
            path: keyword file; first non-blank line is the image count,
                the remaining non-blank lines are the keywords.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the file has no non-blank line, or its first
                non-blank line is not an integer.
        """
        self.num = 1  # 1-based index of the next image to save
        self.class_number = 0  # images wanted per keyword
        self.line_list = []  # keywords to search for
        # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise make int() fail on the first line.
        with open(path, encoding='utf-8-sig') as file:
            stripped = [line.strip() for line in file]
        # Blank lines would otherwise turn into empty search keywords.
        entries = [line for line in stripped if line]
        if not entries:
            raise ValueError('keyword file is empty: ' + str(path))
        self.class_number = int(entries[0])
        self.line_list = entries[1:]

    @staticmethod
    def _image_path(save_path, keyword, index):
        """Return the path of image number *index* for *keyword*."""
        return os.path.join(save_path, '{}_{}.jpg'.format(keyword, index))

    def dowmload_picture(self, html, keyword, save_path):
        """Download the images referenced by one search-result page.

        Every ``"objURL"`` value found in *html* is fetched and saved as
        ``<keyword>_<n>.jpg`` inside *save_path*, where ``n`` is
        ``self.num``. Non-OK responses and bodies of 10000 bytes or fewer
        are skipped. The loop ends once ``self.num`` exceeds
        ``self.class_number``. A failed download is reported and skipped.

        Args:
            html: text of a search-result page.
            keyword: search keyword, used in messages and file names.
            save_path: existing directory that receives the images.
        """
        pic_url = re.findall('"objURL":"(.*?)",', html, re.S)  # image urls
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota before fetching so no extra image is downloaded
            # when the target has already been reached.
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num)
                  + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                if pic.ok and len(pic.content) > self._MIN_IMAGE_BYTES:
                    target = self._image_path(save_path, keyword, self.num)
                    with open(target, 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except Exception as err:
                # Best effort per image; Exception (not BaseException) keeps
                # Ctrl-C and SystemExit working.
                print('error, cannot download:', err)

    # Correctly spelled alias; the original name is kept for existing callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Crawl the search pages of every keyword and save the images.

        For each keyword a folder ``<keyword>_file_<YYYYmmdd_HHMMSS>`` is
        created, ``self.num`` is reset to 1, and result pages are requested
        in steps of 60 until the page offset reaches ``self.class_number``
        or enough images have been saved.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        url = 'https://image.baidu.com/search/flip'
        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            # update() keeps the session's case-insensitive header mapping.
            session.headers.update(headers)

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                # exist_ok: the same keyword may repeat within one second.
                os.makedirs(save_path, exist_ok=True)
                # get images
                image_number = 0
                self.num = 1
                while (image_number < self.class_number
                       and self.num <= self.class_number):
                    # params= lets requests percent-encode the keyword, so
                    # characters such as '&' or '#' cannot break the query.
                    query = {
                        'tn': 'baiduimage',
                        'ie': 'utf-8',
                        'word': word,
                        'pn': str(image_number),
                    }
                    try:
                        result = session.get(url, params=query, timeout=10,
                                             allow_redirects=False)
                    except requests.RequestException as err:
                        print('Internet error:', err)
                    else:
                        self.dowmload_picture(result.text, word, save_path)
                    image_number += self._PAGE_SIZE

# Script entry point: read the keywords from ./keywords.txt and download the
# images for each of them.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
财报 Safetensors PIP git-lfs TTS 签证 Jetson GIT Food CSV BTC OCR Logo tqdm C++ UNIX PyCharm Hungarian Web Magnet Cloudreve NameSilo HuggingFace CV Baidu torchinfo 报税 uwsgi Input HaggingFace Diagram ONNX LLAMA printf Qwen2.5 腾讯云 Freesound Plate COCO FlashAttention Pillow Agent ResNet-50 NLTK FP8 Disk Video WAN PyTorch Claude 版权 Website Bitcoin Transformers tar CLAP DeepSeek Zip EXCEL LLM Python VGG-16 Clash Land TensorFlow RAR Random GPTQ XGBoost Knowledge Bin Bert UI Attention Google VSCode Ptyhon Vmess Pandas Review 阿里云 scipy FP32 Linux FP64 Datetime 第一性原理 NLP Sklearn 多进程 证件照 继承 API RGB PDB llama.cpp Domain Excel ChatGPT CAM ModelScope SVR TSV Tracking BF16 Windows CC uWSGI Miniforge Paddle SAM SQL 递归学习法 TensorRT diffusers GGML Gemma Distillation 公式 XML CEIR Math IndexTTS2 Color Proxy v2ray LaTeX 净利润 transformers JSON Bipartite 音频 图形思考法 Markdown v0.dev Hotel Statistics 域名 Michelin Crawler Dataset mmap PDF MD5 Base64 SQLite Quantization 飞书 LeetCode WebCrawler Qwen2 Streamlit Data 算法题 Hilton CUDA Paper Mixtral Firewall Llama Shortcut Github logger Git FastAPI SPIE 搞笑 hf InvalidArgumentError CTC Algorithm FP16 Tiktoken Anaconda YOLO OpenAI Nginx Vim Breakpoint Django AI Translation Password BeautifulSoup Pytorch DeepStream Use Qwen git 多线程 Card Jupyter Permission 关于博主 LoRA OpenCV GoogLeNet Tensor Docker Numpy Image2Text Heatmap Plotly Pickle Animate Template GPT4 Augmentation Conda Quantize QWEN Interview VPN Ubuntu
站点统计

本站现有博文316篇,共被浏览747850

本站已经建立2397天!

热门文章
文章归档
回到顶部