In [1]:
%%html
<style type='text/css'>
/* Set the font size and font family used by elements with the CodeMirror class. */
/* NOTE(review): 'callable' does not look like a real font family name - confirm the intended font. */
.CodeMirror{ font-size: 14px; font-family: callable}
</style>
In [2]:
# 라이브러리
import os
from datetime import date, timedelta

import requests
from bs4 import BeautifulSoup
In [3]:
# News title
def get_daum_news_title(news_id):
    """Fetch a Daum news article page and return its headline text.

    Args:
        news_id: Article identifier (int or str); it is interpolated into
            ``https://news.v.daum.net/v/{news_id}``.

    Returns:
        str: The text of the page's ``h3.tit_view`` element with single and
        double quote characters removed. Returns ``''`` when the page has no
        such element, and ``'status_code:<code>'`` when the HTTP status is
        not 200.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    url = 'https://news.v.daum.net/v/{}'.format(news_id)
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=10)
    status_code = response.status_code
    if status_code != 200:
        return 'status_code:{}'.format(status_code)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(response.text, 'html.parser')
    title_h3 = soup.select_one('h3.tit_view')
    if title_h3 is None:
        # select_one() returns None when nothing matches; avoid crashing on .text.
        return ''
    return title_h3.text.replace('\'', '').replace('"', '')
In [4]:
# Fetch the headline of one sample article; the bare name on the last line
# is the cell's displayed output.
daum_news_title = get_daum_news_title(20200605110456504)
daum_news_title
Out[4]:
In [5]:
# News content
def get_daum_news_content(news_id):
    """Fetch a Daum news article page and return its body text.

    Args:
        news_id: Article identifier (int or str); it is interpolated into
            ``https://news.v.daum.net/v/{news_id}``.

    Returns:
        str: The concatenated text of every ``<p>`` element found inside
        ``div#harmonyContainer``, with no separator between paragraphs.
        Returns ``''`` when the page has no such container, and
        ``'status_code:<code>'`` when the HTTP status is not 200.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    url = 'https://news.v.daum.net/v/{}'.format(news_id)
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=10)
    status_code = response.status_code
    if status_code != 200:
        return 'status_code:{}'.format(status_code)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.select_one('div#harmonyContainer')
    if container is None:
        # select_one() returns None when nothing matches; the original then
        # failed with "'NoneType' object is not callable".
        return ''
    # find_all('p') is the explicit form of calling the tag: container('p').
    return ''.join(p.text for p in container.find_all('p'))
In [6]:
# Fetch the body text of one sample article; the bare name on the last line
# is the cell's displayed output.
daum_news_content = get_daum_news_content(20200605110456504)
daum_news_content
Out[6]:
In [7]:
# Comments
# news_id = '20200605110456504'
# url = 'https://news.v.daum.net/v/{}'.format(news_id)
# response = requests.get(url)
# status_code = response.status_code
# soup = BeautifulSoup(response.text)
# This approach raises an error
In [8]:
# Comments - fetch one page of comments for an article from the comment API.
# The request carries a bearer token in the Authorization header. The token
# that used to be hard-coded here no longer works, so supply a current one via
# the DAUM_COMMENT_TOKEN environment variable instead of pasting it into the
# notebook (a missing variable raises KeyError naming it).
news_id = '20200605114023995'
url = 'https://comment.daum.net/apis/v1/posts/@{}/comments'.format(news_id)
params = {'parentId': 0, 'offset': 0, 'limit': 10, 'sort': 'RECOMMEND', 'isInitial': 'true'}
headers = {'Authorization': 'Bearer {}'.format(os.environ['DAUM_COMMENT_TOKEN'])}
response = requests.get(url, headers=headers, params=params, timeout=10)
status_code = response.status_code
# Stop here on a rejected request rather than letting later cells fail while
# indexing an error payload.
response.raise_for_status()
comment_all = response.json()
In [9]:
# Content of a single comment: the 'content' field of the first entry in comment_all.
comment_all[0]['content']
Out[9]:
In [10]:
# Content of every comment: print each entry's 'content' field on its own line.
for item in comment_all:
    print(item['content'])
In [11]:
import requests
from bs4 import BeautifulSoup
In [12]:
# Fetch one article page and parse it into `soup`.
url = 'https://news.v.daum.net/v/20200608095807496'
# Without a timeout a stalled server would block the notebook forever.
response = requests.get(url, timeout=10)
# A bare `response.status_code` in the middle of a cell is never displayed and
# checks nothing; raise on a 4xx/5xx response instead.
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(response.text, 'html.parser')
In [13]:
# Look up the first <em class="num_count"> element in the parsed page and print it.
# select_one() returns None when the HTML has no matching element.
count = soup.select_one('em.num_count')
print(count)
In [14]:
# Read the article's comment count from the comment API's post endpoint.
# The bearer token is taken from the DAUM_COMMENT_TOKEN environment variable
# rather than being pasted into the notebook; the token previously hard-coded
# here carried an expiry (exp) claim in June 2020.
url = 'https://comment.daum.net/apis/v1/posts/@20200608095807496'
headers = {'Authorization': 'Bearer {}'.format(os.environ['DAUM_COMMENT_TOKEN'])}
response = requests.get(url, headers=headers, timeout=10)
# A bare `response.status_code` mid-cell is never displayed and checks
# nothing; raise on a 4xx/5xx response before decoding the body.
response.raise_for_status()
data = response.json()
commentCount = data['commentCount']
commentCount
Out[14]:
'학원 > Python' 카테고리의 다른 글
데이터수집 - kweather (0) | 2020.06.08 |
---|---|
데이터수집- XML - 기상청 (0) | 2020.06.08 |
데이터 수집 - 네이버 영화 순위 (0) | 2020.06.08 |
데이터수집 - 네이버 책 검색 (0) | 2020.06.08 |
스크래핑과 크롤링 (0) | 2020.06.02 |