1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
import requests
from bs4 import BeautifulSoup
# Number of listing pages requested and the `start` offset step between them.
PAGE_COUNT = 10
PAGE_SIZE = 25

# Upper bound on the number of entries written to the report.
MOVIE_LIMIT = 250

# Default destination of the generated report (the script's original path).
DEFAULT_OUTPUT_PATH = 'c:\\Users\\lin paddy\\Desktop\\python\\Python爬虫\\豆瓣电影.txt'


def get_movies():
    """Scrape the Douban Top 250 listing pages.

    Requests ``PAGE_COUNT`` listing pages (``start`` = 0, 25, ..., 225),
    prints the HTTP status code of every response, and collects the text of
    selected elements from each page.

    Returns:
        A tuple of four lists of strings, in page order:
        the text of the first ``span.title`` in every ``div.hd``,
        the text of the first ``span.other`` in every ``div.hd``,
        the text of the first ``p`` in every ``div.info`` with spaces
        removed, and the text of ``span.rating_num`` in every ``div.star``.

    Raises:
        requests.RequestException: if a request fails or times out.
        AttributeError: if an expected child element is missing from a
            matched container.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Host': 'movie.douban.com',
    }
    titles = []
    other_names = []
    infos = []
    ratings = []

    for page in range(PAGE_COUNT):
        link = 'https://movie.douban.com/top250?start=' + str(page * PAGE_SIZE)
        r = requests.get(link, headers=headers, timeout=10)
        print(str(page + 1), '页响应状态码', r.status_code)

        soup = BeautifulSoup(r.text, 'lxml')
        # A single pass over div.hd yields both the title and the other names.
        for hd in soup.find_all('div', 'hd'):
            titles.append(hd.find('span', 'title').get_text())
            other_names.append(hd.find('span', 'other').get_text())
        for info_div in soup.find_all('div', 'info'):
            infos.append(info_div.find('p').get_text().replace(' ', ''))
        for star_div in soup.find_all('div', 'star'):
            ratings.append(star_div.find('span', 'rating_num').get_text())

    return titles, other_names, infos, ratings


# NOTE(review): this runs the scrape at import time, not only when the file is
# executed as a script. Kept so the module-level names a, b, c, d stay
# available exactly as before.
a, b, c, d = get_movies()


def main(output_path=DEFAULT_OUTPUT_PATH):
    """Write the scraped movies to a UTF-8 text file.

    Only complete entries are written: the number of entries is the length of
    the shortest of the four module-level lists, capped at ``MOVIE_LIMIT``.
    A short or failed page therefore yields a shorter report instead of an
    ``IndexError`` part-way through writing.

    Args:
        output_path: Path of the report file to create or overwrite.
            Defaults to ``DEFAULT_OUTPUT_PATH``.
    """
    rows = list(zip(a, b, c, d))[:MOVIE_LIMIT]

    # newline='\n' disables newline translation, so the bytes written match
    # what the previous binary-mode codecs.open() call produced.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write('豆瓣电影 Top 250\n')
        for rank, (title, other_name, info, rating) in enumerate(rows, start=1):
            f.write('Top' + str(rank) + '\n')
            f.write('电影名:' + title + other_name + info + '豆瓣评分' + rating)
            f.write('\n\n')


if __name__ == '__main__':
    main()
|