def getNewsDetail(newsurl):
    """Fetch one news page and return {'Title': ..., 'article': ...}.

    Side effect: every link whose href contains "attachment" is
    downloaded into ``folder_path``. Relies on module globals ``s``
    (requests session), ``header`` and ``folder_path``.
    """
    result = {}
    res = s.get(newsurl, headers=header)
    res.encoding = 'utf-8'
    soup = bs(res.text, 'html.parser')
    result['Title'] = soup.select('.text-center')[0].text
    # Paragraph texts of the article body.
    article = [p.text.strip() for p in soup.select('.singleinfo p')]
    # Original behavior kept (first paragraph only), but guard against
    # pages with no '.singleinfo p' content instead of raising IndexError.
    result['article'] = article[0] if article else ''
    downloadurl = []
    filename = []
    for k in soup.find_all(href=re.compile("attachment")):
        downloadurl.append('http://my.bupt.edu.cn/' + k.get('href'))
        filename.append(k.string)
    for link, name in zip(downloadurl, filename):
        if not name:
            # An <a> tag without a text node yields None — skip rather
            # than crash on folder_path + None.
            continue
        download = s.get(link, headers=header)
        # 'with' already closes the file; the original extra f.close()
        # inside the with-block was redundant.
        with open(folder_path + name, "wb") as f:
            f.write(download.content)
    return result
# Collect today's announcements and export them to an Excel sheet.
news_total = []
# Keep the original cap of 29 rows, but never index past the actual
# lists (the page may list fewer items than 29).
for i in range(min(29, len(date), len(url))):
    if date[i] != today + ' ':
        continue
    newsary = getNewsDetail(url[i])
    news_total.append(newsary)
df = pd.DataFrame(news_total)
df.to_excel(folder_path + 'news.xlsx')
五、整体代码
# -*- coding: utf-8 -*- """ Created on Fri Jul 5 16:49:28 2019 @author: byrwyj """
import requests
import http.cookiejar as cookielib
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import os
import datetime

# Today's ISO date (yyyy-mm-dd) names the per-day download folder.
today = datetime.date.today().isoformat()
folder_path = 'C:/Users/john/OneDrive/桌面/' + today + "/"
# The pasted original read "ifnot" (fused token) — a syntax error;
# it must be "if not". makedirs creates intermediate dirs as needed.
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
def getLt(str):  # NOTE(review): parameter shadows builtin str; name kept for caller compatibility
    """Parse login-page HTML and return the hidden form fields.

    Returns a dict mapping each named <input> inside the first <form>
    to its value — the CAS login tokens (lt, execution, ...).
    """
    lt = bs(str, 'html.parser')
    dic = {}
    for inp in lt.form.find_all('input'):
        # Inputs without a name attribute carry no form data.
        if inp.get('name') is not None:
            dic[inp.get('name')] = inp.get('value')
    return dic
# Present as a desktop Chrome browser so the portal serves normal pages.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}
def getNewsDetail(newsurl):
    """Fetch one news page and return {'Title': ..., 'article': ...}.

    Side effect: every link whose href contains "attachment" is
    downloaded into ``folder_path``. Relies on module globals ``s``
    (requests session), ``header`` and ``folder_path``.
    """
    result = {}
    res = s.get(newsurl, headers=header)
    res.encoding = 'utf-8'
    soup = bs(res.text, 'html.parser')
    result['Title'] = soup.select('.text-center')[0].text
    # Paragraph texts of the article body.
    article = [p.text.strip() for p in soup.select('.singleinfo p')]
    # Original behavior kept (first paragraph only), but guard against
    # pages with no '.singleinfo p' content instead of raising IndexError.
    result['article'] = article[0] if article else ''
    downloadurl = []
    filename = []
    for k in soup.find_all(href=re.compile("attachment")):
        downloadurl.append('http://my.bupt.edu.cn/' + k.get('href'))
        filename.append(k.string)
    for link, name in zip(downloadurl, filename):
        if not name:
            # An <a> tag without a text node yields None — skip rather
            # than crash on folder_path + None.
            continue
        download = s.get(link, headers=header)
        # 'with' already closes the file; the original extra f.close()
        # inside the with-block was redundant.
        with open(folder_path + name, "wb") as f:
            f.write(download.content)
    return result
# Log in through the CAS endpoint, then scrape today's announcement list.
response = s.post(
    'https://auth.bupt.edu.cn/authserver/login?service=http%3A%2F%2Fmy.bupt.edu.cn%2Findex.portal',
    data=postdata,
    headers=header,
)
res = s.get('http://my.bupt.edu.cn/index.portal?.pn=p1778', headers=header)
soup = bs(res.text, 'html.parser')

news_total = []
date = []
url = []
# Announcement links contain "detach"; their timestamps carry class "time".
for j in soup.find_all(href=re.compile("detach")):
    url.append('http://my.bupt.edu.cn/' + j.get('href'))
for j in soup.find_all(class_='time'):
    date.append(j.string)

# Keep the original cap of 29 rows, but never index past the actual
# lists (the page may list fewer items than 29).
for i in range(min(29, len(date), len(url))):
    if date[i] != today + ' ':
        continue
    newsary = getNewsDetail(url[i])
    news_total.append(newsary)

df = pd.DataFrame(news_total)
df.to_excel(folder_path + 'news.xlsx')