Crawling Naver news article bodies N times faster

For dynamic crawling, see the references below and pull in whichever module you need when you need it.
https://m.blog.naver.com/21ahn/221329219163
https://www.slideshare.net/wangwonLee/2018-datayanolja-moreeffectivewebcrawling
Crawling Naver's popular news needs nothing more than basic static crawling, as the sketch below shows.
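
For reference, a minimal static fetch of one day's ranking page might look like this. It is only a sketch: the URL and the div.ranking selector mirror the full script further down, and section 100 (politics) plus the date are example values.

import requests
from bs4 import BeautifulSoup

# sketch: fetch one day's ranking page with a plain GET request, no browser automation needed
url = ("https://news.naver.com/main/ranking/popularDay.nhn"
       "?rankingType=popular_day&sectionId=100&date=20160801")
res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(res.text, "html.parser")
for a in soup.select("div.ranking a")[:5]:   # first five ranked article links
    print(a.get("href"))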

Sequential crawling was far too slow, so the crawling is done through a worker pool; see the tutorial below and the short sketch after it.
https://beomi.github.io/2017/07/05/HowToMakeWebCrawler-with-Multiprocess/
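
The full script below uses multiprocessing.Pool; the same idea with an actual thread pool looks roughly like this (a sketch with a placeholder URL list, not code from the tutorial above):

import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # download one page; the User-Agent header helps avoid 403 responses
    return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text

urls = ["https://news.naver.com"] * 5   # placeholder list of article URLs

with ThreadPoolExecutor(max_workers=4) as pool:
    pages = list(pool.map(fetch, urls))   # fetch several pages concurrently

print(len(pages), "pages downloaded")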

The code blocks in this post were styled with the tool below.
https://colorscripter.com/

If you get blocked by a [403] Forbidden error, send a User-Agent header:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
URL = 'http://www.naver.com'
res = requests.get(URL, headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
print(soup)


Some header variations to try (a usage sketch follows the list):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
headers = {'User-Agent':'Chrome/66.0.3359.181'}
headers = {'User-Agent':'Mozilla/5.0', 'referer' : 'http://www.naver.com'}
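
For example, one of these header sets can be reused across requests with a Session (a sketch; the target URL is just an example):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0', 'referer': 'http://www.naver.com'})

res = session.get("https://news.naver.com")   # the headers above are sent automatically
soup = BeautifulSoup(res.text, "html.parser")
print(res.status_code, soup.title.text if soup.title else "")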

import datetime
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
from multiprocessing import Pool
import pandas as pd
section_id = {100:"정치",101:"경제",102:"사회",103:"생활/문화",104:"세계",105:"IT/과학"}
text_news_count = 30
days_range = []
start = datetime.datetime.strptime("2015-08-01", "%Y-%m-%d")
end = datetime.datetime.strptime("2016-08-01", "%Y-%m-%d")  # one day past the end of the range
date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]
for date in date_generated:
    days_range.append(date.strftime("%Y-%m-%d"))
    
def get_bs_obj(url):
    result = requests.get(url)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    
    return bs_obj
def get_links():
    links = []
    count = 1
    for date in days_range:
        for section in range(100,106):
            news_arrange_url = "https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day"
            news_list_date_page_url = news_arrange_url+"&sectionId="+str(section)+"&date="+date.replace("-","")
            # get bs_obj
            bs_obj = get_bs_obj(news_list_date_page_url)
    
            a = bs_obj.find("div",{"class":"content"}).find("div",{"class":"ranking"}).findAll("a")
            ranking_list = set()
            for i in a:
                ranking_list.add(i["href"])
            for rank_URL in ranking_list:
                links.append([section, "https://news.naver.com" + rank_URL, date, count])  # carry the date with each link for get_content
                count += 1
    return links
        
def get_content(link):
    bs_obj = get_bs_obj(link[1])
    main_news = []

    try:
        title = bs_obj.find("h3",{"id":"articleTitle"}).text
        body = bs_obj.find("div",{"class":"_article_body_contents"}).get_text("\n").replace("// flash 오류를 우회하기 위한 함수 추가","").replace("function _flash_removeCallback() {}","")
        aid = bs_obj.find("div",{"class":"article_header"}).find("img")["title"]
        main_news = [link[2], section_id[link[0]], title, body, link[1], aid]  # date, section, title, body, url, press
    except:
        pass  # skip articles whose page layout does not match the selectors above
    return main_news
        
if __name__ == '__main__':
    main_news_list = []
    
    pool = Pool(processes=4)
    main_news_list.append(pool.map(get_content, get_links()))  # crawl article bodies in parallel
    pool.close()
    pool.join()

    # drop articles that failed to parse, then write the result to an .xlsx file
    rows = [row for row in main_news_list[0] if row]
    naver_news_df = pd.DataFrame(rows,
                                 columns=["date", "section", "title", "body", "link", "신문사"])
    # strings_to_urls=False keeps long article URLs from being turned into Excel hyperlinks
    writer = pd.ExcelWriter('150801-160817_news.xlsx', engine='xlsxwriter',
                            options={'strings_to_urls': False})
    naver_news_df.to_excel(writer, 'Sheet')
    writer.save()
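
To sanity-check the result, the saved file can be read back with pandas (assuming the script above has been run and an Excel engine such as openpyxl is installed):

import pandas as pd

df = pd.read_excel('150801-160817_news.xlsx')  # file written by the script above
print(len(df), "articles crawled")
print(df[["date", "section", "title"]].head())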
    
