EADST

Python: Obtain Baidu Images Using Web Crawler

This post shows how to download images from Baidu image search with a Python web crawler.

Here is the main code.

# -*- coding: utf-8 -*-
import os
import re
import time
import requests

class CarCollect:
    """Download a fixed number of Baidu image-search results per keyword.

    The keyword file is UTF-8 text. Its first line is the number of images
    to collect for each keyword; every following non-blank line is one
    keyword.

    Attributes:
        num: 1-based index of the next image to be saved for the current
            keyword; it is also used in the saved file name.
        class_number: Number of images wanted per keyword.
        line_list: Keywords read from the keyword file.
    """

    # Matches the "objURL" entries embedded in the result page source.
    _OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)",', re.S)
    # Responses at or below this many bytes are not saved.
    _MIN_IMAGE_BYTES = 10000
    # Amount the ``pn`` query value is advanced by between page requests.
    _PAGE_STEP = 60

    def __init__(self, path='./name.txt'):
        """Read the image count and the keyword list from *path*.

        Args:
            path: Keyword file; first line is the per-keyword image count,
                remaining lines are keywords.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If the file is empty or its first line is not an
                integer.
        """
        self.num = 1
        self.class_number = 0
        self.line_list = []
        # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise make int() fail on the first line.
        with open(path, encoding='utf-8-sig') as file:
            lines = [line.strip() for line in file]
        if not lines:
            raise ValueError('keyword file is empty: ' + str(path))
        self.class_number = int(lines[0])
        # Blank lines (e.g. a trailing newline) are not keywords.
        self.line_list = [word for word in lines[1:] if word]

    def dowmload_picture(self, html, keyword, save_path):
        """Save the images referenced in *html* until the quota is reached.

        The method name keeps its original spelling so existing callers
        continue to work.

        Args:
            html: Source text of one search result page.
            keyword: Search keyword; used as the file name prefix.
            save_path: Existing directory the images are written into.
        """
        pic_url = self._OBJ_URL_PATTERN.findall(html)  # get image url
        print('Finding keyword: ' + keyword + ' images, start downloading...')
        for each in pic_url:
            # Checked before downloading so that a call made after the quota
            # is already met does not fetch one extra image.
            if self.num > self.class_number:
                break
            print('=' * 60)
            print('Downloading ' + keyword + ' number ' + str(self.num) + ' image, url: ' + str(each))
            if not each:
                continue
            try:
                pic = requests.get(each, timeout=7)
                if len(pic.content) > self._MIN_IMAGE_BYTES:  # img size > 10k
                    # os.path.join uses the separator of the running
                    # platform instead of a hard-coded backslash.
                    file_name = keyword + '_' + str(self.num) + '.jpg'
                    with open(os.path.join(save_path, file_name), 'wb') as fp:
                        fp.write(pic.content)
                    self.num += 1
            except Exception:
                # Best effort: one bad URL must not stop the crawl, but
                # KeyboardInterrupt/SystemExit are allowed to propagate.
                print('error, cannot download')

    def __call__(self):
        """Create one folder per keyword and fill it with downloaded images.

        Each folder is named ``<keyword>_file_<YYYYmmdd_HHMMSS>`` and is
        created in the current working directory.

        Raises:
            OSError: If a keyword folder cannot be created.
        """
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Upgrade-Insecure-Requests': '1'
        }
        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            session.headers = headers

            for word in self.line_list:
                # create a folder
                time_now = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                save_path = word + '_file' + "_" + time_now
                os.mkdir(save_path)
                # get images
                url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
                image_number = 0
                self.num = 1
                # Stop paging as soon as enough images have been saved.
                while image_number < self.class_number and self.num <= self.class_number:
                    try:
                        result = session.get(url + str(image_number), timeout=10, allow_redirects=False)
                    except requests.RequestException:
                        print('Internet error')
                    else:
                        self.dowmload_picture(result.text, word, save_path)
                    image_number += self._PAGE_STEP

# Script entry point: read ./keywords.txt and download images for every keyword.
if __name__ == '__main__':
    path = './keywords.txt'
    car_collect = CarCollect(path)
    car_collect()
    print('Done.')

Here is the text file, keywords.txt. The first line is the number of images to download for each keyword. Each following line is one keyword.

20
Dog
Cat

相关标签
About Me
XD
Goals determine what you are going to be.
Category
标签云
Review Qwen2 Quantize CSV QWEN YOLO TensorFlow 版权 Numpy Claude git ModelScope AI SVR 飞书 SAM Hotel Windows EXCEL Heatmap Hilton 腾讯云 Image2Text LLAMA Ubuntu uwsgi torchinfo Firewall Animate OCR Cloudreve Mixtral VGG-16 Permission WebCrawler Distillation VSCode 财报 hf Card 多进程 Use Website NLTK Diagram Disk LaTeX FP16 BeautifulSoup Web RGB Michelin CLAP uWSGI Freesound llama.cpp Tensor Template BTC Pytorch GIT Plate IndexTTS2 Github Search Hungarian tar FP8 报税 FP64 Excel Git 图形思考法 Django SQL News HuggingFace ChatGPT Crawler UI CAM PIP HaggingFace tqdm PDF JSON Attention 净利润 Dataset GPTQ Math Clash PDB PyCharm CTC Llama Datetime Paddle Interview Qwen Python BF16 icon Pillow transformers Augmentation 云服务器 Proxy Bipartite LoRA mmap Markdown Food Sklearn Agent diffusers OpenAI Safetensors v2ray Linux 公式 Docker GoogLeNet 算法题 Pandas Magnet NLP Video Logo Breakpoint Pickle CC Ptyhon UNIX Password Google TTS 域名 搞笑 logger DeepStream Statistics Tracking 关于博主 音频 Domain ONNX Tiktoken Transformers XML MD5 Bitcoin Zip OpenCV Quantization 多线程 InvalidArgumentError Jetson C++ 第一性原理 Shortcut printf git-lfs FastAPI NameSilo Conda FP32 图标 SQLite 强化学习 Data Anaconda Translation Miniforge API CUDA Jupyter FlashAttention TSV CV v0.dev ResNet-50 GGML LLM DeepSeek Base64 scipy COCO LeetCode Gemma Nginx 顶会 Bert CEIR SPIE 证件照 递归学习法 WAN VPN Qwen2.5 Plotly Knowledge Streamlit Input Algorithm RAR PyTorch Color 继承 Land Paper TensorRT 签证 Vmess GPT4 Random XGBoost Baidu 阿里云 Vim Bin
站点统计

本站现有博文322篇,共被浏览788931

本站已经建立2484天!

热门文章
文章归档
回到顶部