본문 바로가기

학원/Python

데이터 수집 - 네이버 책 검색

NAVER_BOOK

네이버 책 검색기

In [1]:
%%html
<!-- Notebook display tweak: sets the CodeMirror editor font size to 14px.
     NOTE(review): 'callable' is not a generic CSS font family; unless a font
     with that name is installed the browser falls back to its default.
     Confirm which font was intended. -->
<style type='text/css'>
.CodeMirror{ font-size: 14px; font-family: callable}
</style>
In [2]:
# 라이브러리
import requests
from bs4 import BeautifulSoup
In [3]:
# Search endpoint and the query-string parameters sent with the request
url = 'https://book.naver.com/search/search.nhn'
params = dict(
    sm='sta_hty.book',
    sug=' ',
    where='nexearch',
    query='bigdata',  # search keyword
)
In [4]:
# Send the GET request. The timeout keeps the cell from hanging forever
# if the server never answers.
response = requests.get(url, params=params, timeout=10)
status_code = response.status_code
print(status_code)

# Fail right here with a clear HTTPError on a 4xx/5xx reply instead of
# leaving `text` undefined (or stale from an earlier run) for later cells.
response.raise_for_status()
text = response.text
200
In [5]:
# str ==> BeautifulSoup
# Name the parser explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so results can differ
# between machines. 'html.parser' ships with the standard library.
soup = BeautifulSoup(text, 'html.parser')
In [6]:
# Grab the element holding the whole result list (selector found with the
# Chrome developer tools)
book_all = soup.select_one('ul#searchBiblioList')  # roughly: soup.find(id='searchBiblioList') or soup.find(attrs={'id': 'searchBiblioList'})
#book_all
In [7]:
# Pull the <li> entries out of the result list
book_all_li_all = book_all.select('li')  # every book entry
book_all_li_one = book_all.select_one('li')  # first book entry only
In [8]:
# Extract the bid of a single book from the href of its first link
first_href = book_all_li_one.select_one('a')['href']
bid_one = first_href.split('=')[1]  # segment right after the first '='
bid_one
Out[8]:
'15516543'
In [9]:
# Extract the bid of every book from the href of each entry's first link
bid_list = [
    entry.select_one('a')['href'].split('=')[1]
    for entry in book_all_li_all
]
print(bid_list)
['15516543', '13587569', '13783099', '16338249', '14594752', '13784550', '16327795', '16346530', '13399152', '10390764', '16276774', '15748262', '13409559', '15746028', '15744672', '16113809', '10220466', '15811120', '7185569', '15136965']
In [10]:
# Title of a single book, read from the alt text of its first <img>
book_image = book_all_li_one.select_one('img')
book_name_one = book_image.attrs['alt']
book_name_one
Out[10]:
'KNIME을 활용한 Big Data분석'
In [11]:
# Title of every book, read from the alt text of each entry's first <img>
title_list = [entry.select_one('img')['alt'] for entry in book_all_li_all]
#title_list
In [12]:
# Author, publisher and publication date of a single book

book_info = book_all_li_one.select_one('dd.txt_block')
book_text = book_info.text.replace('\n', ' ').replace('\xa0', ' ')
# Continue from the already-cleaned string; restarting from book_info.text
# here would throw away the replacements made on the previous line.
book_text = book_text.replace('\t', ' ').replace('\r', ' ')
# Fields are separated by '|': author | publisher | publication date
book_text_list = book_text.split('|')
book_author = book_text_list[0]
book_publish = book_text_list[1]
book_pubdate = book_text_list[2]
print(book_author, book_publish, book_pubdate)
조치선, 정영진 외 5명 저   엑셈   2019.09.25
In [13]:
# Author, publisher and publication date of every book
author_list = []
publish_list = []
pubdate_list = []

# Deletes newlines, non-breaking spaces, carriage returns and tabs in one pass
_drop_chars = str.maketrans('', '', '\n\xa0\r\t')

for item in book_all_li_all:
    book_info = item.select_one('dd.txt_block')
    book_text_list = book_info.text.translate(_drop_chars).split('|')
    # The last two '|'-separated fields are publisher and publication date;
    # everything before them is contributors (author, illustrator,
    # translator, ...), however many segments that takes.
    author_list.append(''.join(book_text_list[:-2]))
    publish_list.append(book_text_list[-2])
    pubdate_list.append(book_text_list[-1])

print(author_list)
print(publish_list)
print(pubdate_list)
    
['조치선, 정영진 외 5명 저 ', '안지선 글  송진욱 그림', '김진호(대학교수), 최용주(대학부총장) 저 ', '이범식 김은주 전소현 이상범 저 ', '편집부 저 ', '박형준 저 ', 'Petr Skoda 저 ', 'Vikas Khare 저 ', 'Stephens-davidowitz, Seth 저 ', '버나드 마 저  Ann Lee 역', 'Dinesh Peter 저 ', 'Pedersen, John S. (EDT), Wilkinson, Adrian (EDT) 저 ', '조명화(여행작가) 저 ', 'Sarangi, Saswat, Sharma, Pankaj 저 ', 'Sarangi, Saswat, Sharma, Pankaj 저 ', 'Soraya Sedkaoui 저 ', '이랑(가수), 황국영 저 ', 'Zhihua Zhang 저 ', '빅토어 마이어 쇤베르거(대학교수) 저 ', 'Zgurovsky, Michael Z., Zaychenko, Yuriy P. 저 ']
['엑셈', '봄볕', '북카라반', '구민사', 'IRSGlobal', '리드리드출판', 'Elsevier', 'Elsevier', 'DeyStreetBooks', '교학사', 'Elsevier', 'EdwardElgarPub', '테마여행신문TTNThemeTravelNewsKorea', 'RoutledgeIndia', 'RoutledgeIndia', 'Wiley-ISTE', '소시민워크', 'Elsevier', 'JohnMurrayPublishers', 'Springer-NatureNewYorkInc']
['2019.09.25', '2018.05.16', '2018.07.25', '2020.04.20', '2019.02.26', '2018.07.27', '2020.04.22', '2020.04.21', '2018.02.27', '2016.03.20', '2020.03.09', '2019.11.29', '2018.03.03', '2019.10.05', '2019.10.03', '2020.01.09', '2016.02.01', '2019.12.03', '2013.03.01', '2019.07.05']
In [14]:
# Prices of a single book (the first entry): struck-through price and
# current price, each cut off at the '원' currency suffix
book_txt_desc = book_all_li_all[0].select_one('dd.txt_desc')
price_old = book_txt_desc.select_one('strike').text.partition('원')[0]
price_new = book_txt_desc.select_one('em.price').text.partition('원')[0]
print(price_old, price_new)
25000 22500
In [15]:
# Error check: this cell apparently keeps the failing version on purpose;
# its traceback is recorded right below and the next cell has the fix.
book_txt_desc = book_all_li_all[4].select_one('dd.txt_desc')
price_old = book_txt_desc.select_one('strike')
price_new = book_txt_desc.select_one('em.price')

if price_old == None:
    price_old = 0
else :
    # NOTE: `splite` is a typo for `.text.split`. BeautifulSoup treats an
    # unknown attribute on a tag as a child-tag lookup, which presumably
    # yields None here, so calling it raises
    # "TypeError: 'NoneType' object is not callable".
    price_old = price_old.splite('원')[0]
if price_new == None:
    price_new = 0
else :
    # Same typo as above.
    price_new = price_new.splite('원')[0]    

print(price_old, price_new)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-15-0a0c319076c9> in <module>
      7     price_old = 0
      8 else :
----> 9     price_old = price_old.splite('원')[0]
     10 if price_new == None:
     11     price_new = 0

TypeError: 'NoneType' object is not callable
In [16]:
# Prices of every book as (old, new) pairs
price_list = []
for item in book_all_li_all:
    book_txt_desc = item.select_one('dd.txt_desc')
    old_tag = book_txt_desc.select_one('strike')
    new_tag = book_txt_desc.select_one('em.price')

    # Entries without a price element get 0; otherwise keep the digits in
    # front of the '원' suffix (as a string, matching the recorded output).
    # `is None` is the identity test for the missing-tag case.
    price_old = 0 if old_tag is None else old_tag.text.split('원')[0]
    price_new = 0 if new_tag is None else new_tag.text.split('원')[0]
    price_list.append((price_old, price_new))

price_list
Out[16]:
[('25000', '22500'),
 ('13000', '11700'),
 ('16000', '14400'),
 ('21000', '20370'),
 ('390000', '351000'),
 ('15800', '14220'),
 (0, 0),
 (0, 0),
 ('22220', '13880'),
 ('14000', '12600'),
 (0, 0),
 ('189540', '188320'),
 (0, 0),
 ('69650', '57120'),
 ('196560', '177940'),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 ('214180', '190630')]
In [17]:
# Assemble one dict per book from the parallel lists (bid, title, author,
# publisher, publication date, price pair); zip walks them in lockstep.
book_info_list = [
    {
        'bid': bid,
        'title': title,
        'author': author,
        'publish': publish,
        'pubdate': pubdate,
        'price_old_new': price_pair,
    }
    for bid, title, author, publish, pubdate, price_pair in zip(
        bid_list, title_list, author_list,
        publish_list, pubdate_list, price_list,
    )
]
#book_info_list
In [18]:
# Display the second assembled record as a spot check
book_info_list[1]
Out[18]:
{'bid': '13587569',
 'title': '빅데이터',
 'author': '안지선 글  송진욱 그림',
 'publish': '봄볕',
 'pubdate': '2018.05.16',
 'price_old_new': ('13000', '11700')}

'학원 > Python' 카테고리의 다른 글

데이터 수집 - 다음뉴스  (0) 2020.06.08
데이터 수집 - 네이버 영화 순위  (0) 2020.06.08
스크래핑과 크롤링  (0) 2020.06.02
tinyDB 설치와 조작  (0) 2020.06.02
로또 번호 가져오기  (0) 2020.06.02