본문 바로가기

Crawling

urllib, BeautifulSoup 으로 잡코리아 크롤링

import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs

import pandas as pd
# JobKorea search URL template: first {} is the URL-quoted search keyword,
# second {} is the 1-based result page number.
# NOTE: the original line was truncated mid-string (unterminated literal,
# only one placeholder) — reconstructed from the format() call in crawl(),
# which supplies (keyword, page_num).
base_url = 'http://www.jobkorea.co.kr/Search/?stext={}&tabType=recruit&Page_No={}'
def crawl(keyword, page_num):
    """Fetch one page of JobKorea search results for *keyword*.

    Parameters
    ----------
    keyword : str
        Search term (will be URL-quoted before being placed in the URL).
    page_num : int
        1-based result page number.

    Returns
    -------
    pandas.DataFrame
        Columns '기업 이름' (company name) and '자세한 내용' (posting title),
        at most 19 rows per page.
    """
    # Quote into a separate name instead of rebinding the parameter.
    quoted = urllib.parse.quote(keyword)
    url = base_url.format(quoted, page_num)
    response = urllib.request.urlopen(url)
    soup = bs(response, 'html.parser')
    names = [element.text for element in soup.select("a.name")][:19]
    titles = [element.text
              for element in soup.select("div.post-list-info > a.title")][:19]
    # Strip line breaks that the site embeds inside the title text.
    titles = [title.replace("\n", "").replace("\r", "") for title in titles]
    # The two selectors can match different counts on a short/last page;
    # truncate to the common length so DataFrame construction cannot raise
    # ValueError on unequal column lengths.
    count = min(len(names), len(titles))
    return pd.DataFrame({'기업 이름': names[:count], '자세한 내용': titles[:count]})
# Crawl result pages 1-9 for the keyword "데이터" and combine them.
# (The original made an extra throwaway crawl("데이터", 1) call first —
# a wasted HTTP request left over from notebook experimentation; removed.)
df_list = [crawl("데이터", page) for page in range(1, 10)]

# DataFrame.append in a loop is quadratic and was removed in pandas 2.0;
# pd.concat builds the combined frame in one pass with identical output.
df = pd.concat(df_list)

# Fixed filename: the original "recuriut({}).xlsx" had a typo and an
# unfilled {} placeholder.  The `encoding` kwarg was dropped: to_excel
# ignores it for .xlsx and it was removed in pandas >= 2.0.
df.to_excel("recruit(데이터).xlsx")