EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download images from Baidu image search with a small Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect:
    """Download Baidu image-search results for a list of keywords.

    The keyword file is UTF-8 text: its first line is the number of images
    wanted per keyword, and every following non-empty line is one keyword.
    Calling the instance creates one timestamped folder per keyword in the
    current working directory and saves the downloaded images there.
    """

    # Endpoint of the paginated ("flip") Baidu image search.
    SEARCH_URL = 'https://image.baidu.com/search/flip'
    # The ``pn`` query offset is advanced by this much between result pages.
    PAGE_SIZE = 60
    # Responses of this many bytes or fewer are not saved.
    MIN_IMAGE_BYTES = 10000
    # Extracts every image URL embedded in a result page.
    _OBJ_URL_RE = re.compile(r'"objURL":"(.*?)",', re.S)

    def __init__(self, path='./name.txt'):
        """Read the per-keyword image count and the keywords from *path*.

        Args:
            path: Keyword file to read (opened as UTF-8).

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If the file is empty.
            ValueError: If the first line is not an integer.
        """
        self.num = 1  # 1-based index of the next image to save
        with open(path, encoding='utf-8') as file:
            lines = [line.strip() for line in file]
        self.class_number = int(lines[0])
        # Blank lines would otherwise be searched as an empty keyword.
        self.line_list = [word for word in lines[1:] if word]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced by one search-result page.

        Downloads stop as soon as ``self.num`` exceeds ``self.class_number``.
        A URL that fails to download, or whose response is not larger than
        ``MIN_IMAGE_BYTES``, is skipped without consuming an image number.

        Args:
            html: Text of a search-result page.
            keyword: Keyword used as the file-name prefix.
            save_path: Existing directory the images are written to.
        """
        pic_url = self._OBJ_URL_RE.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Checked before downloading so that a call made after the quota
            # has been reached fetches nothing.
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                if len(pic.content) > self.MIN_IMAGE_BYTES:  # img size > 10k
                    # os.path.join picks the right separator on every OS.
                    target = os.path.join(save_path, keyword + '_' + str(self.num) + '.jpg')
                    with open(target, 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except (requests.RequestException, OSError, ValueError):
                # Best effort: one bad URL must not abort the whole page, but
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                print('error, cannot download')

    # Correctly spelled alias; the original (misspelled) name is kept so that
    # existing callers continue to work.
    download_picture = dowmload_picture

    def __call__(self):
        """Search every keyword and download its images into a new folder."""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        session = requests.Session()
        session.headers.update(headers)

        for word in self.line_list:
            # create a folder
            time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
            save_path = word + '_file' + "_" + time_now
            os.makedirs(save_path, exist_ok=True)
            # get images
            self.num = 1
            for image_number in range(0, self.class_number, self.PAGE_SIZE):
                # Passing the keyword through ``params`` lets requests
                # URL-encode it (spaces, '&', '#', non-ASCII text, ...).
                params = {
                    'tn': 'baiduimage',
                    'ie': 'utf-8',
                    'word': word,
                    'pn': image_number,
                }
                try:
                    result = session.get(self.SEARCH_URL, params=params,
                                         timeout=10, allow_redirects=False)
                except requests.RequestException:
                    print('Internet error')
                    continue
                self.dowmload_picture(result.text, word, save_path)

# Script entry point: read the keywords from ./keywords.txt and download the
# images for each of them.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each of the following lines is one keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
CLAP Freesound Conda Agent Vmess QWEN CEIR Tracking FP32 Breakpoint mmap Jetson Linux 多线程 Pytorch Baidu Markdown Python Michelin GPT4 Password Permission NLP Hilton API 音频 域名 Proxy IndexTTS2 Pickle 递归学习法 WAN HuggingFace 强化学习 XGBoost VPN CSV LoRA InvalidArgumentError hf BF16 uWSGI Cloudreve Disk 腾讯云 Hungarian NLTK DeepStream Dataset TensorFlow SQLite Quantization 飞书 Image2Text scipy UNIX git-lfs PDF UI SVR 版权 BTC 财报 OCR 多进程 Heatmap 搞笑 图形思考法 Bert Input printf Sklearn OpenAI Crawler Land Llama Search Card LLAMA Gemma XML PDB Zip 算法题 PyTorch Template TTS Firewall uwsgi Diagram Django 报税 JSON OpenCV Paddle CV Magnet FP64 FP16 FlashAttention v0.dev Nginx ChatGPT CUDA Datetime Food Safetensors Quantize Bitcoin PyCharm Ubuntu Hotel 继承 ResNet-50 Algorithm GIT FP8 Review Docker Miniforge LLM GPTQ Vim Pandas Augmentation Anaconda Distillation 签证 Tiktoken RAR BeautifulSoup transformers SPIE 关于博主 torchinfo Math Transformers VGG-16 Qwen2.5 Tensor Data Qwen YOLO v2ray EXCEL Statistics llama.cpp Paper Plotly MD5 Excel Qwen2 TensorRT News Claude Clash GGML COCO Domain logger VSCode Animate LeetCode TSV Jupyter 阿里云 Git tqdm WebCrawler PIP GoogLeNet ModelScope Pillow CTC Numpy tar Random DeepSeek Base64 Windows Interview 净利润 diffusers Logo C++ Plate ONNX Video 公式 Streamlit Web SAM Translation Bipartite 顶会 Ptyhon Mixtral Google CC 证件照 Color Knowledge git Bin NameSilo Use Github Website HaggingFace LaTeX RGB SQL Attention 第一性原理 Shortcut AI CAM FastAPI
站点统计

本站现有博文320篇,共被浏览756685

本站已经建立2421天!

热门文章
文章归档
回到顶部