import requests

from bs4 import BeautifulSoup

import csv

import time

# Browser-like request headers passed to requests.get() when fetching pages.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

def _parse_info(info_tag):
    """Extract ``(director, year, country)`` from a movie's info paragraph.

    The paragraph is expected to hold two text runs separated by a ``<br>``:
    a credits line containing ``导演:`` / ``主演:`` labels, followed by a
    ``year / country / genres`` line (assumption about the page layout --
    verify against the live HTML). Any piece that cannot be found is
    returned as an empty string instead of raising.
    """
    if info_tag is None:
        return "", "", ""

    # stripped_strings yields each text node separately, so the two lines on
    # either side of the <br> stay distinct (get_text(strip=True) would glue
    # them together with no newline between them). Non-breaking spaces are
    # normalised so they cannot hide the labels or the '/' separators.
    lines = [text.replace('\xa0', ' ') for text in info_tag.stripped_strings]

    credits = lines[0] if lines else ""
    # partition() never raises: a missing label simply yields an empty name.
    director = credits.partition('导演:')[2].partition('主演:')[0].strip()

    year = country = ""
    if len(lines) > 1:
        fields = [field.strip() for field in lines[1].split('/')]
        year = fields[0]
        if len(fields) > 1:
            country = fields[1]
    return director, year, country


def get_movies():
    """Scrape the Douban Top 250 list into ``douban_movies.csv``.

    Requests the ten list pages (25 entries each, ``start=0, 25, ..., 225``),
    parses every ``div.item`` entry and writes one CSV row per movie with the
    columns rank, title, rating, director, year, country and quote. The
    output file is created in the current working directory and overwritten
    if it already exists. The function pauses two seconds after each page.

    Raises:
        requests.RequestException: if a page cannot be fetched within the
            timeout or the server answers with an HTTP error status.
    """
    # Create the CSV file that stores the scraped data.
    with open('douban_movies.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["排名", "电影名", "评分", "导演", "年份", "国家", "简介"])

        # Walk the 10 pages of the Top 250 list (25 entries per page).
        for page in range(0, 250, 25):
            url = f"https://movie.douban.com/top250?start={page}"
            # A timeout keeps a stalled connection from hanging the script;
            # raise_for_status() surfaces blocked/failed requests instead of
            # silently producing an empty CSV.
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Parse every movie entry on the page.
            for item in soup.find_all('div', class_='item'):
                rank = item.find('em').text
                title = item.find('span', class_='title').text
                rating = item.find('span', class_='rating_num').text
                director, year, country = _parse_info(
                    item.find('div', class_='bd').p
                )

                # Not every entry carries a one-line quote.
                quote_tag = item.find('span', class_='inq')
                quote = quote_tag.text if quote_tag else "无简介"

                writer.writerow(
                    [rank, title, rating, director, year, country, quote]
                )

            time.sleep(2)  # Throttle: avoid hitting the server too frequently.

if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    get_movies()

Reply to this note

Please Login to reply.

Discussion

No replies yet.