import requests
from bs4 import BeautifulSoup
import csv
import time
# Request headers shared by every HTTP call in this module: a desktop-browser
# User-Agent string (presumably so the site does not reject the default
# client identifier -- confirm against the target site's behavior).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}
def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or "" when the tag is missing."""
    return tag.text if tag is not None else ""


def _parse_info(lines):
    """Extract ``(director, year, country)`` from an item's info text lines.

    ``lines`` is the info paragraph split into lines: the first line is
    expected to hold the "导演: ... 主演: ..." credits and the second the
    "year / country / genre" summary. Any piece that cannot be found is
    returned as an empty string instead of raising.
    """
    # The page may use non-breaking spaces (\xa0) around the "/" separators;
    # normalize them so the plain ' / ' split below can match.
    cleaned = [line.replace('\xa0', ' ').strip() for line in lines]
    cleaned = [line for line in cleaned if line]

    director = ""
    if cleaned and '导演: ' in cleaned[0]:
        director = cleaned[0].split('导演: ', 1)[1].split('主演:')[0].strip()

    year = ""
    country = ""
    if len(cleaned) > 1:
        year_country = [part.strip() for part in cleaned[1].split(' / ')]
        year = year_country[0]
        if len(year_country) > 1:
            country = year_country[1]
    return director, year, country


def get_movies():
    """Scrape the Douban Top 250 movie list into ``douban_movies.csv``.

    Fetches the 10 list pages (25 movies each), parses every movie entry and
    writes one CSV row per movie: rank, title, rating, director, year,
    country and the one-line quote (or "无简介" when the entry has none).
    The file is created (or overwritten) in the current working directory.

    Raises:
        requests.RequestException: if a page cannot be fetched within the
            timeout or the server answers with an HTTP error status.
    """
    # Create the CSV file that stores the scraped data.
    with open('douban_movies.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["排名", "电影名", "评分", "导演", "年份", "国家", "简介"])

        # Walk the 10 pages of the Top 250 list (25 entries per page).
        for page in range(0, 250, 25):
            url = f"https://movie.douban.com/top250?start={page}"
            # A timeout keeps a stalled connection from hanging the script.
            response = requests.get(url, headers=headers, timeout=10)
            # Fail loudly on an error page instead of silently writing no rows.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Parse every movie entry on the page.
            for item in soup.find_all('div', class_='item'):
                rank = _tag_text(item.find('em'))
                title = _tag_text(item.find('span', class_='title'))
                rating = _tag_text(item.find('span', class_='rating_num'))

                body = item.find('div', class_='bd')
                paragraph = body.p if body is not None else None
                # Join text nodes with '\n' so the credits line and the
                # year/country line stay separable; get_text(strip=True)
                # alone joins them with an empty separator.
                info = (
                    paragraph.get_text('\n', strip=True).split('\n')
                    if paragraph is not None
                    else []
                )
                director, year, country = _parse_info(info)

                quote_tag = item.find('span', class_='inq')
                quote = quote_tag.text if quote_tag else "无简介"

                writer.writerow(
                    [rank, title, rating, director, year, country, quote]
                )

            time.sleep(2)  # Pause between pages to avoid rapid-fire requests.
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    get_movies()