import pandas
from wordcloud import WordCloud
import matplotlib.pyplot as plt

pandas.set_option('display.max_columns', 500)
pandas.options.display.max_colwidth = 500

#LOAD CORPUS
# https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
myCorpus = pandas.read_excel('C:/Workshop/TexasCOVIDCorpus.xlsx',header=0)

#Look at header row and first row only
#print(myCorpus.iloc[0])

#Look at first article info

# 0 is row number, '...' is a column name
print(myCorpus.loc[0,'Article Title'])
print()

# ['..', '...'] is a list of columns you want
print(myCorpus.loc[0,['Article Title','Source Title']]) 
print()

#Look at Abstract
print(myCorpus.loc[0,'Abstract']) 
print()

#Look at the first 5 values in the Discipline column
print(myCorpus['Discipline'].iloc[0:5])
print()

#Create a variable for a theme you want to look for
myTheme = 'mortality'

#Or ask the user to type one in
myTheme = input("Please type a word you want to search for: ").lower()

#Create a new column of all lowercase Abstract
#This is in case your search term appears both as 'Masks' and 'masks'
myCorpus['Abstract_lower'] = myCorpus['Abstract'].str.lower()

#Find all the abstracts which contain your search term
#Make sure to search in your LOWER case column
result = myCorpus[myCorpus['Abstract_lower'].str.contains(myTheme)].index
print(result)

#Do the same, but also make it into a list
resultList = myCorpus[myCorpus['Abstract_lower'].str.contains(myTheme)].index.tolist()
print(resultList)

#Retrieve all the values for the Discipline column, for ONLY your result rows
myDisciplines = myCorpus.loc[resultList,'Discipline']

#Print the results without repetitions
print(myDisciplines.drop_duplicates())
print()

#Make a simple report
print("Theme Keyword: " + myTheme)
print("Disciplines addressing that theme:")
print (myDisciplines.drop_duplicates())

#Store your results in a list
myDisciplinesList = myDisciplines.drop_duplicates().tolist()

#Get all the paper titles and their journals, for ONLY your result rows.
allThemePapers = myCorpus.loc[resultList,['Article Title','Source Title']]

#Make your dataframe into a list of lists, for prettier reporting
allThemePapersList = allThemePapers.values.tolist()

#Loop over your list of lists for a report
print("\n\nThere are " + str(len(allThemePapersList)) \
      + " papers Addressing " + myTheme + "\n")
for paper in allThemePapersList:
    print(paper[0])
    print(paper[1])
    print()

#Extract more information for a saved list
columnsToKeep = ['Article Title','Source Title','Publication Year','Volume','Issue',\
                 'Discipline','Abstract','Abstract_lower']
allThemePapers = myCorpus.loc[resultList,columnsToKeep]

#Some housekeeping for missing values
allThemePapers['Publication Year'] = allThemePapers['Publication Year'].fillna(0)
allThemePapers['Volume'] = allThemePapers['Volume'].fillna(0)

#Make year and volume into integers
allThemePapers['Publication Year'] = allThemePapers['Publication Year'].astype('int')
allThemePapers['Volume'] = allThemePapers['Volume'].astype('int')

#Group the papers by discipline
allThemePapers = allThemePapers.sort_values('Discipline')

#Write the result to Excel file
allThemePapers.to_excel("C:/Workshop/Papers_on_" + myTheme + ".xlsx")

#Make a word cloud for your customized group
cloudString = ' '.join(allThemePapers['Abstract_lower'])

#Some words we already know are there because of our Web Of Science search
#Omit those from the word cloud
wordsToOmit = ['covid','covid-19','texas','study','method','data','results',\
               'pandemic','median','conclusion','participants','scale'\
               'questionnaire','control','research','effect']
for word in wordsToOmit:
    cloudString = cloudString.replace(word, '')

wordcloud = WordCloud(width=800, height=640, background_color='white', \
                      min_font_size=10).generate(cloudString)

# display the word cloud
plt.figure(figsize=(10, 7), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
#___________________________________________________________________

#If your first theme returns too many papers and/or disciplines, you can
#do it again, using your first result as input. Try 'mental', then 'access'
myTheme2 = input("Please type a word you want to search for: ").lower()

#Find all the abstracts which contain your search term
#Make sure to search in your LOWER case column
result2 = allThemePapers[allThemePapers['Abstract_lower'].str.contains(myTheme2)].index
print(result2)

#Do the same, but also make it into a list
resultList2 = allThemePapers[allThemePapers['Abstract_lower'].str.contains(myTheme2)].index.tolist()
print(resultList2)

#Retrieve all the values for the Discipline column, for ONLY your result rows
myDisciplines2 = allThemePapers.loc[resultList2,'Discipline']

#Print the results without repetitions
print(myDisciplines2.drop_duplicates())
print()

#Make a simple report
print("Theme Keywords: " + myTheme + " " + myTheme2)
print("Disciplines addressing those themes:")
print (myDisciplines2.drop_duplicates())

#Store your results in a list
myDisciplinesList2 = myDisciplines2.drop_duplicates().tolist()

#Get all the paper titles and their journals, for ONLY your result rows.
allThemePapers2 = allThemePapers.loc[resultList2,['Article Title','Source Title']]

#Make your dataframe into a list of lists, for prettier reporting
allThemePapersList2 = allThemePapers2.values.tolist()

#Loop over your list of lists for a report
print("\n\nThere are " + str(len(allThemePapersList2)) \
      + " papers addressing " + myTheme + " and " + myTheme2 + "\n")
for paper in allThemePapersList2:
    print(paper[0])
    print(paper[1])
    print()

#Extract more information for a saved list
columnsToKeep = ['Article Title','Source Title','Publication Year','Volume','Issue',\
                 'Discipline','Abstract','Abstract_lower']
allThemePapers2 = myCorpus.loc[resultList2,columnsToKeep]

#Group the papers by discipline
allThemePapers2 = allThemePapers2.sort_values('Discipline')

#Write the result to Excel file
allThemePapers2.to_excel("C:/Workshop/Papers_on_" + myTheme + "_" + myTheme2 + ".xlsx")
 
#Make a word cloud for your customized group
cloudString2 = ' '.join(allThemePapers2['Abstract_lower'])

#Some words we already know are there because of our Web Of Science search
#Omit those from the word cloud
wordsToOmit = ['covid','covid-19','texas','study','method','data','results',\
               'pandemic','median','conclusion','participants','scale'\
               'questionnaire','control','research','effect']
for word in wordsToOmit:
    cloudString2 = cloudString2.replace(word, '')

wordcloud2 = WordCloud(width=800, height=640, background_color='white', \
                      min_font_size=10).generate(cloudString2)

# display the word cloud
plt.figure(figsize=(10, 7), facecolor=None)
plt.imshow(wordcloud2)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()