투빅스 11기&12기 6주차 크롤링 - 이유진

by yooj_lee posted Sep 04, 2019
?

단축키

Prev이전 문서

Next다음 문서

ESC닫기

+ - Up Down Comment Print

1. 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import selenium 
from selenium import webdriver
import random
from time import sleep
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import json
 
def get_url():
    """Open the Instagram hashtag page for a tag read from standard input.

    Reads one line from stdin (with an empty prompt), starts a Chrome
    WebDriver, stores it in the module-level ``browser`` global so the other
    helpers can use it, and navigates to the tag's explore page.

    Returns:
        The Chrome WebDriver instance that was created.
    """
    global browser
    tag = input('')
    browser = webdriver.Chrome()
    browser.get('https://www.instagram.com/explore/tags/{}/?hl=ko'.format(tag))
    return browser
 
def click_first_photo():
    """Open the hashtag page via ``get_url()`` and click its first thumbnail.

    The thumbnail is located by a fixed XPath under ``#react-root``.
    Returns ``None``.
    """
    first_thumbnail_xpath = '//*[@id="react-root"]/section/main/article/div[1]/div/div/div[1]/div[1]/a/div/div[2]'
    driver = get_url()
    driver.find_element_by_xpath(first_thumbnail_xpath).click()
 
def get_post():
    """Scrape the caption block of the post currently shown in ``browser``.

    Uses the first element with class ``C4VMK`` found by the module-level
    ``browser`` driver.

    Returns:
        dict with three keys:
        ``post_contents`` - text of the block's first ``<span>``,
        ``post_accounts`` - text of the block's first ``<a>``,
        ``post_date``     - ``title`` attribute of the block's ``<time>``.

    Raises:
        Whatever the driver raises when one of the elements is not found
        within the 5-second implicit wait.
    """
    post_dct = {}
    browser.implicitly_wait(5)
    post_tags = browser.find_element_by_class_name('C4VMK')
    # Same tag-to-field mapping as get_comments(): <a> holds the account name
    # and <span> holds the text. The two were previously stored under each
    # other's key.
    post_dct['post_contents'] = post_tags.find_element_by_tag_name('span').text
    post_dct['post_accounts'] = post_tags.find_element_by_tag_name('a').text
    post_dct['post_date'] = post_tags.find_element_by_tag_name('time').get_attribute('title')
    return post_dct
 
def get_comments():
    """Scrape the comments of the post currently shown in ``browser``.

    Every element with class ``C4VMK`` after the first one is treated as a
    comment (get_post() reads the first one as the post itself).

    Returns:
        A list with one dict per comment, numbered from 1, of the form
        ``{'comment<N>_accounts': <a> text, 'comment<N>_contents': <span> text}``.
        The list is empty when there are no comments, in which case a notice
        is also printed.
    """
    browser.implicitly_wait(5)
    comments = browser.find_elements_by_class_name('C4VMK')[1:]
    comments_list = []
    for number, comment in enumerate(comments, start=1):
        comments_list.append({
            'comment{}_accounts'.format(number): comment.find_element_by_tag_name('a').text,
            'comment{}_contents'.format(number): comment.find_element_by_tag_name('span').text,
        })
    if not comments_list:
        print('--> no_comments')  # printed when the post has no comments
    return comments_list
 
def get_info():
    """Collect the current post and its comments into a single dict.

    Returns:
        ``{'post': <result of get_post()>, 'comments': <result of get_comments()>}``.
    """
    post = get_post()
    comments = get_comments()
    return {'post': post, 'comments': comments}
 
 
def click_next_photo(path = '/html/body/div[4]/div[1]/div/div/a[2]'):
    """Click the element at ``path`` to move on to the next post.

    Args:
        path: XPath of the "next" control, looked up with the module-level
            ``browser`` driver. The default selects the second ``<a>`` in the
            container.

    Returns ``None``.
    """
    browser.find_element_by_xpath(path).click()
 
def export_result_json(data):
    """Write ``data`` as indented JSON to ``instagram_crawling_results.json``.

    The file is created (or overwritten) in the current working directory and
    encoded as UTF-8. Non-ASCII text such as Korean captions is written as-is
    rather than as ``\\uXXXX`` escapes, so the file stays human-readable.

    Args:
        data: Any JSON-serialisable object.
    """
    # The filename and the mode must be two separate arguments; without the
    # comma the adjacent literals are concatenated into a single filename.
    with open('instagram_crawling_results.json', 'w', encoding = 'UTF-8') as f:
        json.dump(data, f, indent = 4, ensure_ascii = False)
 
def instagram_crawler(n_iter = 50):
    """Crawl up to ``n_iter`` consecutive posts of a hashtag and save them.

    Opens the first post of the hashtag page, then for each iteration scrapes
    the post with ``get_info()``, clicks through to the next post and sleeps
    for a random 3-5 seconds. The collected results are written with
    ``export_result_json()``.

    Args:
        n_iter: Number of posts to visit.

    Raises:
        Whatever ``click_next_photo()`` raises when the "next" control cannot
        be found. The posts collected up to that point are still exported
        before the exception propagates.
    """
    click_first_photo()
    res = []
    try:
        for i in range(n_iter):
            print('Post{}'.format(i+1))
            try:
                info = get_info()
                res.append(info)
            except Exception as e: # a post that cannot be scraped is reported and skipped
                print(e)
            if i == 0:
                # The first post needs a different path for its "next" button.
                click_next_photo('/html/body/div[4]/div[1]/div/div/a')
            else:
                click_next_photo()
            sleep(random.uniform(3,5))
    finally:
        # Export even when navigation fails or the run is interrupted, so the
        # posts already collected are not lost.
        export_result_json(res)
 
# Script entry point: run the crawler with its default of 50 posts. The
# hashtag to crawl is read from standard input by get_url().
instagram_crawler() # iteration = 50
 
 
cs

Articles

2 3 4 5 6 7 8 9 10 11

나눔글꼴 설치 안내


이 PC에는 나눔글꼴이 설치되어 있지 않습니다.

이 사이트를 나눔글꼴로 보기 위해서는
나눔글꼴을 설치해야 합니다.

설치 취소

Designed by sketchbooks.co.kr / sketchbook5 board skin

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5