[파이썬] 영어제목의 키워드 분석하기(+워드클라우드)📰

hyerimmy 2021. 2. 17. 23:20

2021.02.16
22:00 - 23:15

!pip install matplotlib
!pip install wordcloud

import pandas as pd
import glob
import re
from functools import reduce
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS, WordCloud

# Collect the names of the exported Excel files (10 files in the author's
# data set) into the all_files list.
#
# The pattern uses a forward slash as the directory separator: it is accepted
# on Windows as well as POSIX systems, whereas a backslash is treated as part
# of the file name outside Windows and the pattern would match nothing there.
# sorted() fixes the load order, because glob returns matches in arbitrary
# (file-system dependent) order.
all_files = sorted(glob.glob('8-1_data/myCabinetExcelData*.xls'))
all_files  # notebook display: check which file names were matched

# Read each workbook named in all_files into a DataFrame and keep the frames
# in all_files_data, in the same order as the file names.
all_files_data = [pd.read_excel(workbook_path) for workbook_path in all_files]
all_files_data[0]  # notebook display: contents of the first file

# Merge every per-file DataFrame into the single DataFrame
# all_files_data_concat. axis=0 stacks the frames vertically (row-wise);
# ignore_index=True asks pandas for a fresh index on the combined frame
# instead of the per-file row labels.
concat_options = {'axis': 0, 'ignore_index': True}
all_files_data_concat = pd.concat(all_files_data, **concat_options)
all_files_data_concat  # notebook display: the combined data

# Pull the title column ('제목') out of the combined data and keep it as
# all_title for the text preprocessing that follows.
title_column = '제목'
all_title = all_files_data_concat[title_column]
all_title  # notebook display: the extracted titles

import nltk

# Download the NLTK data used by the preprocessing step:
#   stopwords - English stop-word list (stopwords.words)
#   punkt     - tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer tables that newer NLTK releases load instead of punkt
#   wordnet   - lexical database behind WordNetLemmatizer
#   omw-1.4   - multilingual WordNet data that some NLTK releases require when
#               WordNet is loaded
# nltk.download() reports an unknown resource id and returns False rather than
# raising, so requesting the newer ids is harmless on an older NLTK.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Preparation for preprocessing
stopWords = set(stopwords.words("english"))  # English stop words from NLTK
lemma = WordNetLemmatizer()  # object that performs lemmatization


def _title_to_lemmas(raw_title):
    """Return the cleaned, lemmatized word tokens of a single title."""
    # Replace every run of non-alphabetic characters with a single space.
    letters_only = re.sub(r"[^a-zA-Z]+", " ", str(raw_title))
    # Normalize to lower case, then split into word tokens.
    tokens = word_tokenize(letters_only.lower())
    # Drop stop words and lemmatize whatever is left.
    return [lemma.lemmatize(token) for token in tokens if token not in stopWords]


# One token list per title, in the same order as all_title.
words = [_title_to_lemmas(title) for title in all_title]

print(words)  # print to check the result

# Flatten the per-title token lists in `words` into one flat list of words.
# A nested comprehension is used instead of reduce(lambda x, y: x + y, words):
# it yields an empty list when there are no titles (reduce without an initial
# value raises TypeError on an empty sequence) and it does not re-copy the
# accumulated list once per title.
words2 = [word for title_words in words for word in title_words]
print(words2)

# Data exploration - word frequencies - (1)
# Counter maps every word in words2 to the number of times it occurs.
count = Counter(words2)
count  # notebook display: inspect the counts

# Data exploration - word frequencies - (2)
# From the 50 most frequent words, keep those longer than one character in
# word_count and print each kept word with its frequency.
word_count = {}

for word, frequency in count.most_common(50):
    if len(str(word)) <= 1:
        continue  # single-character tokens are left out
    word_count[word] = frequency
    print("%s : %d" % (word, frequency))

# Data exploration - bar chart of the selected words, most frequent first.

# Sort the (word, frequency) pairs once so that labels and bar heights are
# guaranteed to stay aligned; the sort is stable, so words with equal
# frequency keep their order from word_count.
ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
sorted_Keys = [word for word, _ in ranked]
sorted_Values = [frequency for _, frequency in ranked]

bar_positions = range(len(ranked))
plt.bar(bar_positions, sorted_Values, align='center')
# rotation must be a number (degrees) or 'horizontal'/'vertical'; the numeric
# string '85' is rejected by current matplotlib, so pass the number itself.
plt.xticks(bar_positions, sorted_Keys, rotation=85)
plt.show()

# Result visualization - graph
## (1) Count the number of documents per publication date ('출판일').
# The author's heading describes this as a per-year count; that presumes the
# column holds the publication year - confirm against the data.

# doc_count is a helper column filled with 0; counting it within each group
# gives the number of rows that share a publication date.
all_files_data_concat['doc_count'] = 0
docs_by_date = all_files_data_concat.groupby('출판일', as_index=False)
summary_year = docs_by_date['doc_count'].count()
summary_year  # notebook display: one row per publication date

# Result visualization - graph
## (2) Draw the per-date document counts as a line graph.

# Points are placed at evenly spaced positions 0..n-1 and labelled with the
# matching '출판일' value.
tick_positions = range(len(summary_year))
tick_labels = list(summary_year['출판일'])

plt.figure(figsize=(12, 5))
plt.xlabel("year")
plt.ylabel("doc-count")
plt.grid(True)
plt.plot(tick_positions, summary_year['doc_count'])
plt.xticks(tick_positions, tick_labels)
plt.show()

# Result visualization - word cloud
## (1) Build the word cloud and show it.

# Stop-word set for the WordCloud object. It is deliberately NOT called
# `stopwords`: that name is the nltk.corpus.stopwords import used by the
# preprocessing step, and rebinding it to a set would break that step when
# it is run again.
# NOTE(review): the wordcloud documentation says the stopwords option is
# ignored by generate_from_frequencies, so it should have no effect on the
# image below - confirm before relying on it.
wc_stopwords = set(STOPWORDS)
wc = WordCloud(background_color='ivory', stopwords=wc_stopwords,
               width=800, height=600)  # word cloud generator
cloud = wc.generate_from_frequencies(word_count)  # sized by the word_count frequencies

# Show the generated word cloud with matplotlib.pyplot.
plt.figure(figsize=(8, 8))
plt.imshow(cloud)
plt.axis('off')
plt.show()

# Result visualization - word cloud
## (2) Save the word cloud as an image file.
# A forward slash is used as the directory separator: it works on Windows and
# POSIX alike, whereas a backslash outside Windows becomes part of the file
# name and the image would not land inside the 8-1_data directory.
cloud.to_file("8-1_data/riss_bigdata_wordCloud.jpg")