95 lines
2.5 KiB
Markdown
95 lines
2.5 KiB
Markdown
import pandas as pd
|
|
import seaborn as sns
|
|
from wordcloud import WordCloud
|
|
import matplotlib.pyplot as plt
|
|
from IPython.display import display, Markdown
|
|
from collections import Counter
|
|
from itertools import islice
|
|
|
|
# how-gen-ai-affects-researchers
# Load the article table; the sections below read their columns from it.
articles = pd.read_csv(filepath_or_buffer='data/articles.csv')
|
|
|
|
## Most common tags
|
|
|
|
# Each cell holds one ';'-separated string of tags. Lower-casing first makes
# differently-cased spellings of the same tag count as one.
man_tags = articles['Manual Tags'].dropna().str.lower()
auto_tags = articles['Automatic Tags'].dropna().str.lower()

# One list entry per tag occurrence, manual tags first, then automatic ones.
# Empty entries (produced by a trailing or doubled ';') are skipped so that
# the empty string is never ranked as a tag.
tags = [
    tag
    for column in (man_tags, auto_tags)
    for tag in column.str.split(';').explode().str.strip()
    if tag
]
c = Counter(tags)

# most_common() yields (tag, count) pairs, most frequent first. Every item is
# written as "1."; Markdown numbers the rendered ordered list on its own.
text = '## Top 50 common tags:\n\n' + ''.join(
    f"1. {tag} ({count})\n" for tag, count in c.most_common(50)
)
display(Markdown(text))
|
|
|
|
# Draw the tag counts in `c` as a word cloud on a white background.
wc = WordCloud(background_color="white", scale=8)
wc.generate_from_frequencies(c)

# Show the cloud on a 10x10 figure with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis("off")
plt.show()
|
|
|
|
## Title
|
|
|
|
# Lower-cased titles of every article that has one.
titles = articles['Title'].dropna().str.lower()

# Build the word cloud from all titles joined into one space-separated text.
tc = WordCloud(background_color="white", scale=8)
tc.generate(" ".join(titles))

# Show the cloud on a 10x10 figure with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(tc)
plt.axis("off")
plt.show()
|
|
|
|
---
|
|
|
|
top_title = "### Top 50 phrase in title\n\n"

# process_text() returns a token -> count mapping for the combined titles.
# Counter.most_common(50) keeps the 50 highest counts in descending order
# (ties stay in their original order), matching the tag section's approach.
tc_words = dict(
    Counter(
        WordCloud(scale=8, background_color="white").process_text(" ".join(titles))
    ).most_common(50)
)

# Every item is written as "1."; Markdown numbers the rendered ordered list
# on its own.
top_title += "".join(f"1. {word} ({freq})\n" for word, freq in tc_words.items())

display(Markdown(top_title))
|
|
|
|
---
|
|
|
|
## Abstract
|
|
|
|
# Lower-cased abstracts of every article that has one.
abstracts = articles['Abstract Note'].dropna().str.lower()

# Build the word cloud from all abstracts joined into one space-separated text.
ac = WordCloud(background_color="white", scale=8)
ac.generate(" ".join(abstracts))

# Show the cloud on a 10x10 figure with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(ac)
plt.axis("off")
plt.show()
|
|
|
|
---
|
|
|
|
top_abstract = "### Top 50 phrase in abstract\n\n"

# process_text() returns a token -> count mapping for the combined abstracts.
# Counter.most_common(50) keeps the 50 highest counts in descending order
# (ties stay in their original order), matching the tag section's approach.
ac_words = dict(
    Counter(
        WordCloud(scale=8, background_color="white").process_text(" ".join(abstracts))
    ).most_common(50)
)

# Every item is written as "1."; Markdown numbers the rendered ordered list
# on its own.
top_abstract += "".join(f"1. {word} ({freq})\n" for word, freq in ac_words.items())

display(Markdown(top_abstract))
|
|
|
|
---
|
|
|
|
## Journal
|
|
|
|
# Lower-cased publication titles of every article that has one.
pub = articles['Publication Title'].dropna().str.lower()

# Build the word cloud from all publication titles joined into one
# space-separated text.
pc = WordCloud(background_color="white", scale=8)
pc.generate(" ".join(pub))

# Show the cloud on a 10x10 figure with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(pc)
plt.axis("off")
plt.show()
|
|
|
|
---
|
|
|
|
top_pub = "### Top 50 phrase in publication title\n\n"

# process_text() returns a token -> count mapping for the combined publication
# titles. Counter.most_common(50) keeps the 50 highest counts in descending
# order (ties stay in their original order), matching the tag section's
# approach.
pc_words = dict(
    Counter(
        WordCloud(scale=8, background_color="white").process_text(" ".join(pub))
    ).most_common(50)
)

# Every item is written as "1."; Markdown numbers the rendered ordered list
# on its own.
top_pub += "".join(f"1. {word} ({freq})\n" for word, freq in pc_words.items())

display(Markdown(top_pub))
|