1.
# Instagram hashtag crawler built on Selenium.
#
# NOTE(review): this script uses the Selenium 3 `find_element_by_*` /
# `find_elements_by_*` API, which was removed in Selenium 4 — pin
# `selenium<4` or migrate to `browser.find_element(By.XPATH, ...)`
# before upgrading.
import selenium
from selenium import webdriver
import random
from time import sleep
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import json


def get_url():
    """Ask for a hashtag, open its Instagram explore page, return the driver.

    Side effect: binds the Chrome driver to the module-global ``browser``,
    which every other helper in this script reads.
    """
    global browser
    keyword = input('')  # hashtag to crawl (entered with no prompt text)
    url = 'https://www.instagram.com/explore/tags/{}/?hl=ko'.format(keyword)
    browser = webdriver.Chrome()
    browser.get(url)
    return browser


def click_first_photo():
    """Open the tag page and click its first photo to enter the post modal."""
    browser = get_url()
    photo_click = browser.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[1]/div/div/div[1]/div[1]/a/div/div[2]')
    photo_click.click()
    return None


def get_post():
    """Scrape the post body (the first 'C4VMK' element) of the open modal.

    Returns a dict with the text of the post's <a> and <span> elements and
    the date taken from the <time> element's ``title`` attribute.
    """
    post_dct = {}
    browser.implicitly_wait(5)  # give the modal time to render
    post_tags = browser.find_element_by_class_name('C4VMK')
    post_dct['post_contents'] = post_tags.find_element_by_tag_name('a').text
    post_dct['post_accounts'] = post_tags.find_element_by_tag_name('span').text
    post_dct['post_date'] = post_tags.find_element_by_tag_name('time').get_attribute('title')
    return post_dct


def get_comments():
    """Scrape all comments of the open post modal.

    The first 'C4VMK' element is the post itself, so it is skipped.
    Returns a list of dicts, one per comment, keyed
    ``comment{i}_accounts`` / ``comment{i}_contents`` (1-based).
    """
    comments_list = []
    browser.implicitly_wait(5)
    comments = browser.find_elements_by_class_name('C4VMK')[1:]
    # enumerate(..., start=1) keeps the original 1-based key suffixes.
    for i, comment in enumerate(comments, start=1):
        comments_dct = {}
        comments_dct['comment{}_accounts'.format(i)] = comment.find_element_by_tag_name('a').text
        comments_dct['comment{}_contents'.format(i)] = comment.find_element_by_tag_name('span').text
        comments_list.append(comments_dct)
    if not comments_list:
        print('--> no_comments')  # printed when the post has no comments
    return comments_list


def get_info():
    """Collect the current post and its comments into one record."""
    return {'post': get_post(), 'comments': get_comments()}


def click_next_photo(path='/html/body/div[4]/div[1]/div/div/a[2]'):
    """Click the modal's 'next' arrow.

    The default XPath matches every post except the very first one, whose
    arrow lives at a slightly different path (see instagram_crawler).
    """
    click_next = browser.find_element_by_xpath(path)
    click_next.click()
    return None


def export_result_json(data):
    """Dump the crawled records to 'instagram_crawling_results.json'.

    ``ensure_ascii=False`` keeps Korean text human-readable in the UTF-8
    file; the json default would escape it all to \\uXXXX sequences.
    """
    with open('instagram_crawling_results.json', 'w', encoding='UTF-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


def instagram_crawler(n_iter=50):
    """Crawl ``n_iter`` consecutive posts for the hashtag typed at the prompt.

    A failure on a single post is printed and skipped so one broken page
    does not abort the whole run; results are written to JSON at the end.
    """
    click_first_photo()
    res = []
    for i in range(n_iter):
        print('Post{}'.format(i + 1))
        try:
            info = get_info()
            res.append(info)
        except Exception as e:  # best-effort: report and skip posts that fail to load
            print(e)
        if i == 0:
            # the first post's "next" button has a different XPath
            click_next_photo('/html/body/div[4]/div[1]/div/div/a')
        else:
            click_next_photo()
        sleep(random.uniform(3, 5))  # randomized delay to look less bot-like
    export_result_json(res)


if __name__ == '__main__':
    # Guarded so importing this module no longer launches Chrome.
    instagram_crawler()  # iteration = 50
Designed by sketchbooks.co.kr / sketchbook5 board skin
Sketchbook5, 스케치북5
Sketchbook5, 스케치북5
Sketchbook5, 스케치북5
Sketchbook5, 스케치북5