2.5 KiB
import pandas as pd import seaborn as sns from wordcloud import WordCloud import matplotlib.pyplot as plt from IPython.display import display, Markdown from collections import Counter from itertools import islice
# how-gen-ai-affects-researchers
# Load the article metadata table. The sections below read these columns from
# it: 'Manual Tags', 'Automatic Tags', 'Title', 'Abstract Note' and
# 'Publication Title'. The path is relative to the working directory.
articles = pd.read_csv('data/articles.csv')
Most common tags
# Most common tags: count every manual and automatic tag, list the 50 most
# frequent ones and draw a word cloud sized by tag frequency.
man_tags = articles['Manual Tags'].dropna().str.lower()
auto_tags = articles['Automatic Tags'].dropna().str.lower()

# Each cell holds several tags separated by ';'. Manual tags stay ahead of the
# automatic ones so that equally frequent tags keep the same relative order in
# the ranking.
split_tags = (
    pd.concat([man_tags, auto_tags], ignore_index=True)
    .str.split(';')
    .explode()
    .str.strip()
)
# A trailing or doubled ';' produces an empty string, which is not a tag; drop
# it so it cannot appear in the ranking or be handed to the word cloud.
tags = [tag for tag in split_tags.to_list() if tag]
c = Counter(tags)

# Every item is written as "1." on purpose: Markdown renumbers ordered lists.
text = '## Top 50 common tags:\n\n' + "".join(
    f"1. {tag} ({count})\n" for tag, count in c.most_common(50)
)
display(Markdown(text))

# The Counter maps tag -> number of occurrences, which is exactly the
# frequency mapping the word cloud expects.
wc = WordCloud(scale=8, background_color="white").generate_from_frequencies(c)
plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis("off")
plt.show()
Title
# Title: word cloud of the article titles plus the 50 most frequent phrases.
titles = articles['Title'].dropna().str.lower()

# Tokenise the joined titles once and reuse the counts for both the picture
# and the ranking. WordCloud.generate(text) is process_text(text) followed by
# generate_from_frequencies(), so the cloud is built from the same data as
# before without a second, throw-away WordCloud instance.
tc = WordCloud(scale=8, background_color="white")
title_counts = tc.process_text(" ".join(titles))
tc.generate_from_frequencies(title_counts)
plt.figure(figsize=(10, 10))
plt.imshow(tc)
plt.axis("off")
plt.show()

top_title = "### Top 50 phrase in title\n\n"
# Highest count first; sorted() is stable, so phrases with equal counts keep
# the order in which process_text() returned them.
tc_words = dict(
    sorted(title_counts.items(), key=lambda item: item[1], reverse=True)[:50]
)
# Every item is written as "1." on purpose: Markdown renumbers ordered lists.
top_title += "".join(f"1. {word} ({freq})\n" for word, freq in tc_words.items())
display(Markdown(top_title))
Abstract
# Abstract: word cloud of the article abstracts plus the 50 most frequent
# phrases.
abstracts = articles['Abstract Note'].dropna().str.lower()

# Tokenise the joined abstracts once and reuse the counts for both the picture
# and the ranking. WordCloud.generate(text) is process_text(text) followed by
# generate_from_frequencies(), so the cloud is built from the same data as
# before without a second, throw-away WordCloud instance.
ac = WordCloud(scale=8, background_color="white")
abstract_counts = ac.process_text(" ".join(abstracts))
ac.generate_from_frequencies(abstract_counts)
plt.figure(figsize=(10, 10))
plt.imshow(ac)
plt.axis("off")
plt.show()

top_abstract = "### Top 50 phrase in abstract\n\n"
# Highest count first; sorted() is stable, so phrases with equal counts keep
# the order in which process_text() returned them.
ac_words = dict(
    sorted(abstract_counts.items(), key=lambda item: item[1], reverse=True)[:50]
)
# Every item is written as "1." on purpose: Markdown renumbers ordered lists.
top_abstract += "".join(
    f"1. {word} ({freq})\n" for word, freq in ac_words.items()
)
display(Markdown(top_abstract))
Journal
# Journal: word cloud of the publication titles plus the 50 most frequent
# phrases.
pub = articles['Publication Title'].dropna().str.lower()

# Tokenise the joined publication titles once and reuse the counts for both
# the picture and the ranking. WordCloud.generate(text) is process_text(text)
# followed by generate_from_frequencies(), so the cloud is built from the same
# data as before without a second, throw-away WordCloud instance.
pc = WordCloud(scale=8, background_color="white")
pub_counts = pc.process_text(" ".join(pub))
pc.generate_from_frequencies(pub_counts)
plt.figure(figsize=(10, 10))
plt.imshow(pc)
plt.axis("off")
plt.show()

top_pub = "### Top 50 phrase in publication title\n\n"
# Highest count first; sorted() is stable, so phrases with equal counts keep
# the order in which process_text() returned them.
pc_words = dict(
    sorted(pub_counts.items(), key=lambda item: item[1], reverse=True)[:50]
)
# Every item is written as "1." on purpose: Markdown renumbers ordered lists.
top_pub += "".join(f"1. {word} ({freq})\n" for word, freq in pc_words.items())
display(Markdown(top_pub))