Extract Webpage Information with Python
作者:XD / 发表: 2020年12月29日 07:57 / 更新: 2020年12月29日 07:57 / 编程笔记 / 阅读量:2874
Here is a Python program that extracts webpage information with BeautifulSoup and saves the data to a CSV file.
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
# Scrape a saved roster page and append one CSV row per listed person:
# university, name, title, address, email.

# Input page (a locally saved copy) and the base name of the output CSV.
url = 'file:///Users/xd/Desktop/ieee/Region_5_Student_Branch_Counselors_and_Chairs.htm'
save_file = 'ieee_info_1'

# Read the page inside a context manager so the handle is always closed.
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser")

# University headings and roster blocks are collected separately and then
# paired by position.
# NOTE(review): zip() stops at the shorter list, so a heading without a
# matching roster block would silently shift every later pair -- confirm
# the page always has one roster block per heading.
universities = soup.find_all('div', class_='spoName bullet pad-t15')
people = soup.find_all('div', class_='roster-results')

# Column order of the output rows (the CSV itself is written without a header).
list_name = ['university', 'name', 'title', 'address', 'email']

# Rows are accumulated here and written in one go after the loop instead of
# building a one-row DataFrame and reopening the file on every iteration.
content = []
for u, p in zip(universities, people):
    info = p.find_all('p')
    university = u.get_text()

    # A roster block with no <p> at all has nothing to extract.
    if not info:
        print('Skipping entry with no details: {}'.format(university))
        continue

    name = info[0].get_text()
    if name == 'Position Vacant':
        continue

    # Paragraphs 0, 2, 3 and 4 are read by index below, so at least five
    # are required; skip short entries rather than abort with IndexError.
    if len(info) < 5:
        print('Skipping entry with unexpected layout: {}'.format(university))
        continue

    title = info[2].get_text()
    address = info[3].get_text() + ', ' + info[4].get_text()
    # Drops the first 7 characters of the last paragraph -- presumably a
    # fixed label in front of the address; TODO confirm against the page.
    email = info[-1].get_text()[7:]
    content.append([university, name, title, address, email])

# Append mode without a header, as before, so repeated runs keep extending
# the same file. Nothing is written (and no file is created) if no rows
# were collected.
if content:
    data = pd.DataFrame(columns=list_name, data=content)
    data.to_csv("{}.csv".format(save_file), mode='a', index=False, header=False, encoding='utf-8')
相关标签