EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download images from Baidu Image Search with a small Python web crawler built on `requests`.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect():
    """Download Baidu image-search results for a list of keywords.

    The keyword file read by ``__init__`` has the number of images wanted
    per keyword on its first line and one keyword on each following line.
    Calling the instance creates one timestamped folder per keyword in the
    current working directory and saves the downloaded images there.
    """

    def __init__(self, path='./name.txt'):
        """Load the per-keyword image count and the keywords from *path*.

        Args:
            path: Text file whose first line is an integer (images wanted
                per keyword) and whose remaining lines are keywords.

        Raises:
            ValueError: If the file is empty or its first line is not an
                integer.
            OSError: If the file cannot be opened.
        """
        self.num = 1            # 1-based index of the next image to save
        self.class_number = 0   # images wanted per keyword
        self.line_list = []     # keywords to search for
        # 'utf-8-sig' reads plain UTF-8 too, and additionally strips the BOM
        # some Windows editors prepend (which would otherwise break int()).
        with open(path, encoding='utf-8-sig') as file:
            lines = [line.strip() for line in file]
        if not lines:
            raise ValueError('keyword file is empty: ' + str(path))
        self.class_number = int(lines[0])
        # Skip blank lines so they are not searched as empty keywords.
        self.line_list = [word for word in lines[1:] if word]

    @staticmethod
    def _image_path(save_path, keyword, index):
        """Return the path of image number *index* for *keyword* inside *save_path*."""
        return os.path.join(save_path, keyword + '_' + str(index) + '.jpg')

    def dowmload_picture(self, html, keyword, save_path):
        """Download the images referenced by one search-result page.

        Every ``"objURL"`` value found in *html* is fetched; responses larger
        than 10000 bytes are written to *save_path* and counted in
        ``self.num``. Downloading stops once ``self.num`` exceeds
        ``self.class_number``. A failed download is reported and skipped.

        Args:
            html: Text of a search-result page.
            keyword: Keyword being processed; used in file names and messages.
            save_path: Existing directory that receives the image files.
        """
        pic_url = re.findall('"objURL":"(.*?)",', html, re.S)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Check the quota before downloading so that a page processed
            # after the quota is already met does not fetch an extra image.
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                # Do not save HTTP error pages as images.
                pic.raise_for_status()
                if len(pic.content) > 10000:  # img size > 10k
                    with open(self._image_path(save_path, keyword, self.num), 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except Exception:
                # Best effort: skip this image. Exception (not BaseException)
                # so that Ctrl-C / SystemExit still stop the crawler.
                print('error, cannot download')

    # Correctly spelled alias; the original name is kept for existing callers.
    download_picture = dowmload_picture

    def __call__(self):
        """Crawl every loaded keyword and save its images.

        For each keyword a folder named ``<keyword>_file_<YYYYmmdd_HHMMSS>``
        is created, then result pages are requested with the ``pn`` offset
        advancing by 60 per request until the offset reaches
        ``self.class_number`` or enough images have been saved.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        session = requests.Session()
        session.headers.update(headers)

        for word in self.line_list:
            # create a folder
            time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
            save_path = word + '_file' + "_" + time_now
            os.makedirs(save_path, exist_ok=True)
            # get images
            image_number = 0
            self.num = 1
            while image_number < self.class_number and self.num <= self.class_number:
                try:
                    # Passing the query through `params` lets requests escape
                    # the keyword (spaces, '&', '#', non-ASCII text).
                    result = session.get(
                        'https://image.baidu.com/search/flip',
                        params={
                            'tn': 'baiduimage',
                            'ie': 'utf-8',
                            'word': word,
                            'pn': image_number,
                        },
                        timeout=10,
                        allow_redirects=False,
                    )
                    self.dowmload_picture(result.text, word, save_path)
                except Exception:
                    # Best effort: report and move on to the next page.
                    print('Internet error')
                image_number += 60

if __name__ == '__main__':
    # Load the image count and keywords, crawl every keyword, then report.
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one search keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Pytorch TensorFlow CUDA Clash PIP Python 图形思考法 Paper Color CSV scipy VGG-16 Video Hilton JSON HaggingFace PDB Tracking Bitcoin 飞书 Website BeautifulSoup Jupyter Pillow Plate Translation 第一性原理 Streamlit Claude Quantization diffusers SVR Vim v0.dev WAN Bin InvalidArgumentError Random Card FP8 OCR YOLO PDF GPTQ 域名 Image2Text Conda Base64 SQL MD5 XML tar OpenAI EXCEL News NameSilo UNIX Land HuggingFace Github 搞笑 DeepStream 音频 uwsgi Numpy Google ONNX Datetime transformers Nginx 继承 Augmentation Ptyhon Dataset 净利润 Domain QWEN Qwen Heatmap Template Bert Excel Tiktoken Django Input git Search Hotel Miniforge torchinfo ChatGPT 版权 C++ PyTorch Gemma Breakpoint Crawler Michelin Git 腾讯云 UI RGB Freesound Anaconda 强化学习 API ResNet-50 Shortcut Paddle ModelScope XGBoost Tensor 财报 GPT4 公式 DeepSeek CTC 签证 Proxy 云服务器 TTS Qwen2 Statistics Llama FastAPI Food RAR WebCrawler mmap CC CLAP Pandas TSV uWSGI LoRA LeetCode PyCharm Interview SQLite SPIE v2ray FP32 Jetson 证件照 COCO NLP TensorRT BTC Firewall hf GoogLeNet Baidu llama.cpp Markdown NLTK SAM CAM LLM 多进程 Quantize LLAMA Animate VSCode git-lfs GIT FP64 阿里云 Sklearn Magnet Safetensors GGML 算法题 Data Attention CEIR Docker Distillation BF16 printf Algorithm Disk Vmess Pickle Mixtral 报税 Math VPN Transformers Bipartite logger Plotly IndexTTS2 Review Cloudreve Zip OpenCV Password Agent Qwen2.5 多线程 Ubuntu Hungarian FlashAttention CV 顶会 Web 递归学习法 LaTeX 关于博主 AI FP16 Use Linux Knowledge Logo Windows Permission tqdm Diagram
站点统计

本站现有博文321篇,共被浏览773459

本站已经建立2462天!

热门文章
文章归档
回到顶部