how-gen-ai-affects-researchers/README.md
2024-12-04 16:52:06 +07:00

95 lines
2.5 KiB
Markdown

import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from collections import Counter
from itertools import islice
# Load the article table from CSV. The cells below read its 'Manual Tags',
# 'Automatic Tags', 'Title', 'Abstract Note' and 'Publication Title' columns.
articles = pd.read_csv('data/articles.csv')
# how-gen-ai-affects-researchers
## Most common tags
# Gather every tag from both tag columns. Rows without tags are dropped and
# the remaining cells are lower-cased before being split on ';'.
man_tags = articles['Manual Tags'].dropna().str.lower()
auto_tags = articles['Automatic Tags'].dropna().str.lower()
tags = [
    tag
    for column in (man_tags, auto_tags)
    for tag in column.str.split(';').explode().str.strip()
    # A trailing or doubled ';' produces an empty string after stripping;
    # skip it so that it is not counted (and listed) as a tag.
    if tag
]
c = Counter(tags)

# Counter.most_common yields (tag, count) pairs, most frequent first.
# Every item is written as "1." because Markdown renumbers ordered lists
# when rendering.
text = '## Top 50 common tags:\n\n' + ''.join(
    f"1. {tag} ({count})\n" for tag, count in c.most_common(50)
)
display(Markdown(text))
# Draw the tag counts in `c` as a word cloud on a square figure without axes.
tag_cloud = WordCloud(scale=8, background_color="white")
wc = tag_cloud.generate_from_frequencies(c)
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wc)
ax.axis("off")
plt.show()
## Title
# Lower-cased article titles (rows without a title are dropped), joined into
# a single text from which the word cloud is generated.
titles = articles['Title'].dropna().str.lower()
title_text = " ".join(titles)
tc = WordCloud(scale=8, background_color="white").generate(title_text)

# Show the cloud on a square figure without axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(tc)
ax.axis("off")
plt.show()
---
# Count words/phrases in the joined titles with WordCloud.process_text, then
# keep the 50 entries with the highest counts, largest first.
title_counts = WordCloud(scale=8, background_color="white").process_text(" ".join(titles))
ranked_titles = sorted(title_counts.items(), key=lambda item: item[1], reverse=True)
tc_words = dict(ranked_titles[:50])

# Render the ranking as a Markdown ordered list.
top_title = "### Top 50 phrase in title\n\n" + "".join(
    f"1. {word} ({freq})\n" for word, freq in tc_words.items()
)
display(Markdown(top_title))
---
## Abstract
# Lower-cased abstracts (rows without an abstract are dropped), joined into a
# single text from which the word cloud is generated.
abstracts = articles['Abstract Note'].dropna().str.lower()
abstract_text = " ".join(abstracts)
ac = WordCloud(scale=8, background_color="white").generate(abstract_text)

# Show the cloud on a square figure without axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(ac)
ax.axis("off")
plt.show()
---
# Count words/phrases in the joined abstracts with WordCloud.process_text,
# then keep the 50 entries with the highest counts, largest first.
abstract_counts = WordCloud(scale=8, background_color="white").process_text(" ".join(abstracts))
ranked_abstracts = sorted(abstract_counts.items(), key=lambda item: item[1], reverse=True)
ac_words = dict(ranked_abstracts[:50])

# Render the ranking as a Markdown ordered list.
top_abstract = "### Top 50 phrase in abstract\n\n" + "".join(
    f"1. {word} ({freq})\n" for word, freq in ac_words.items()
)
display(Markdown(top_abstract))
---
## Journal
# Lower-cased publication titles (rows without one are dropped), joined into
# a single text from which the word cloud is generated.
pub = articles['Publication Title'].dropna().str.lower()
pub_text = " ".join(pub)
pc = WordCloud(scale=8, background_color="white").generate(pub_text)

# Show the cloud on a square figure without axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(pc)
ax.axis("off")
plt.show()
---
# Count words/phrases in the joined publication titles with
# WordCloud.process_text, then keep the 50 entries with the highest counts,
# largest first.
pub_counts = WordCloud(scale=8, background_color="white").process_text(" ".join(pub))
ranked_pubs = sorted(pub_counts.items(), key=lambda item: item[1], reverse=True)
pc_words = dict(ranked_pubs[:50])

# Render the ranking as a Markdown ordered list.
top_pub = "### Top 50 phrase in publication title\n\n" + "".join(
    f"1. {word} ({freq})\n" for word, freq in pc_words.items()
)
display(Markdown(top_pub))