# -*- coding: utf-8 -*-
"""
Created on Mon Sep 9 10:55:55 2019
@author: Administrator
"""
import os
import requests
import bs4
from bs4 import BeautifulSoup
def main():
    """Cache the 2019 ranking page on disk and print the first 50 entries.

    The page is saved under ``d:/pictures//`` (file name taken from the last
    URL segment), then fetched again as text, parsed into a list of rows and
    printed as a table.
    """
    # The copy of this script that was scraped from the web had the host
    # mangled ("HdhCmsTest" in place of "www." and "测试数据" in place of
    # ".com"), which can never resolve. Presumably this is the intended host.
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
    root = "d:/pictures//"
    # open(path, "w") does not create missing directories.
    os.makedirs(root, exist_ok=True)
    path = root + url.split("/")[-1]
    writehtml(url, path)

    uinfo = []
    html = gethtmltext(url)
    # gethtmltext returns "" on failure; there is nothing to parse then.
    if html:
        fillunivlist(uinfo, html)
    printunivlist(uinfo, 50)
def writehtml(url, path):
    """Download *url* and save its prettified HTML to *path*, unless cached.

    If *path* already exists and is non-empty, nothing is downloaded and a
    "file already exists" message is printed instead.

    Args:
        url: Address of the page to download.
        path: Destination file; written as UTF-8.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    try:
        cached = os.path.getsize(path) > 0
    except OSError:
        # Missing (or unreadable) file: treat as not cached.
        cached = False

    if cached:
        print("文件已存在")
        return

    # A timeout keeps a stalled server from hanging the script forever.
    r = requests.get(url, timeout=30)
    # Refuse to cache an error page: it would be reported as "already
    # exists" on every later run.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    with open(path, "w", encoding="utf-8") as f1:
        f1.write(soup.prettify())
    print("保存成功")
def gethtmltext(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``""`` if the request fails, times out, or
        the server answers with an HTTP error status.
    """
    try:
        # A timeout keeps a stalled server from hanging the script forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding guessed from the content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return ""
def fillunivlist(ulist, html):
    """Append one ``[rank, name, score]`` row to *ulist* per table row in *html*.

    The values are the ``.string`` of the 1st, 2nd and 4th ``<td>`` of every
    ``<tr>`` directly under the first ``<tbody>``; the caller prints them as
    rank, school name and score.

    Args:
        ulist: List that is extended in place.
        html: HTML text to parse. If it contains no ``<tbody>`` (for example
            the empty string), *ulist* is left untouched.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # No ranking table in the document: nothing to add.
        return

    for tr in tbody.children:
        # .children also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        # Calling a tag is shorthand for find_all: every <td> in this row.
        tds = tr("td")
        if len(tds) < 4:
            # Not a full data row; tds[3] would raise IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
def printunivlist(ulist, num):
    """Print a header line followed by the first *num* rows of *ulist*.

    Args:
        ulist: Sequence of rows; items 0, 1 and 2 of each row are printed as
            rank, school name and score. A ``None`` item is printed as an
            empty cell.
        num: Maximum number of rows to print.
    """
    # Format-spec reminder:
    #   {:>10}  right-align, pad to 10 characters with ASCII spaces
    #   {:0>10} right-align, pad with "0"
    #   {:^10}  centre, pad with ASCII spaces
    #   {:<10}  left-align
    # In {1:{3}^10} the fill character is itself taken from argument 3.
    tl = "{0:^10}\t{1:{3}^10}\t{2:<4}"
    # chr(12288) is the full-width (ideographic) space; padding the Chinese
    # column with it keeps the columns visually aligned.
    fill = chr(12288)
    print(tl.format("排名", "学校名称", "分数", fill))
    for ul in ulist[:num]:
        # None has no support for alignment specs and would raise TypeError.
        cells = ["" if ul[i] is None else ul[i] for i in range(3)]
        print(tl.format(cells[0], cells[1], cells[2], fill))
if __name__ == "__main__":
main()
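# See more details about implementing a university-ranking crawler with python bs4+requests...
# Disclaimer: this article comes from the web and does not represent the position of 好得很程序员自学网; when reposting, please cite the source: http://haodehen.cn/did171696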