import pandas from wordcloud import WordCloud import matplotlib.pyplot as plt pandas.set_option('display.max_columns', 500) pandas.options.display.max_colwidth = 500 #LOAD CORPUS # https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html myCorpus = pandas.read_excel('C:/Workshop/TexasCOVIDCorpus.xlsx',header=0) #Look at header row and first row only #print(myCorpus.iloc[0]) #Look at first article info # 0 is row number, '...' is a column name print(myCorpus.loc[0,'Article Title']) print() # ['..', '...'] is a list of columns you want print(myCorpus.loc[0,['Article Title','Source Title']]) print() #Look at Abstract print(myCorpus.loc[0,'Abstract']) print() #Look at the first 5 values in the Discipline column print(myCorpus['Discipline'].iloc[0:5]) print() #Create a variable for a theme you want to look for myTheme = 'mortality' #Or ask the user to type one in myTheme = input("Please type a word you want to search for: ").lower() #Create a new column of all lowercase Abstract #This is in case your search term appears both as 'Masks' and 'masks' myCorpus['Abstract_lower'] = myCorpus['Abstract'].str.lower() #Find all the abstracts which contain your search term #Make sure to search in your LOWER case column result = myCorpus[myCorpus['Abstract_lower'].str.contains(myTheme)].index print(result) #Do the same, but also make it into a list resultList = myCorpus[myCorpus['Abstract_lower'].str.contains(myTheme)].index.tolist() print(resultList) #Retrieve all the values for the Discipline column, for ONLY your result rows myDisciplines = myCorpus.loc[resultList,'Discipline'] #Print the results without repetitions print(myDisciplines.drop_duplicates()) print() #Make a simple report print("Theme Keyword: " + myTheme) print("Disciplines addressing that theme:") print (myDisciplines.drop_duplicates()) #Store your results in a list myDisciplinesList = myDisciplines.drop_duplicates().tolist() #Get all the paper titles and their journals, for ONLY your result rows. allThemePapers = myCorpus.loc[resultList,['Article Title','Source Title']] #Make your dataframe into a list of lists, for prettier reporting allThemePapersList = allThemePapers.values.tolist() #Loop over your list of lists for a report print("\n\nThere are " + str(len(allThemePapersList)) \ + " papers Addressing " + myTheme + "\n") for paper in allThemePapersList: print(paper[0]) print(paper[1]) print() #Extract more information for a saved list columnsToKeep = ['Article Title','Source Title','Publication Year','Volume','Issue',\ 'Discipline','Abstract','Abstract_lower'] allThemePapers = myCorpus.loc[resultList,columnsToKeep] #Some housekeeping for missing values allThemePapers['Publication Year'] = allThemePapers['Publication Year'].fillna(0) allThemePapers['Volume'] = allThemePapers['Volume'].fillna(0) #Make year and volume into integers allThemePapers['Publication Year'] = allThemePapers['Publication Year'].astype('int') allThemePapers['Volume'] = allThemePapers['Volume'].astype('int') #Group the papers by discipline allThemePapers = allThemePapers.sort_values('Discipline') #Write the result to Excel file allThemePapers.to_excel("C:/Workshop/Papers_on_" + myTheme + ".xlsx") #Make a word cloud for your customized group cloudString = ' '.join(allThemePapers['Abstract_lower']) #Some words we already know are there because of our Web Of Science search #Omit those from the word cloud wordsToOmit = ['covid','covid-19','texas','study','method','data','results',\ 'pandemic','median','conclusion','participants','scale'\ 'questionnaire','control','research','effect'] for word in wordsToOmit: cloudString = cloudString.replace(word, '') wordcloud = WordCloud(width=800, height=640, background_color='white', \ min_font_size=10).generate(cloudString) # display the word cloud plt.figure(figsize=(10, 7), facecolor=None) plt.imshow(wordcloud) plt.axis("off") plt.tight_layout(pad=0) plt.show() #___________________________________________________________________ #If your first theme returns too many papers and/or disciplines, you can #do it again, using your first result as input. Try 'mental', then 'access' myTheme2 = input("Please type a word you want to search for: ").lower() #Find all the abstracts which contain your search term #Make sure to search in your LOWER case column result2 = allThemePapers[allThemePapers['Abstract_lower'].str.contains(myTheme2)].index print(result2) #Do the same, but also make it into a list resultList2 = allThemePapers[allThemePapers['Abstract_lower'].str.contains(myTheme2)].index.tolist() print(resultList2) #Retrieve all the values for the Discipline column, for ONLY your result rows myDisciplines2 = allThemePapers.loc[resultList2,'Discipline'] #Print the results without repetitions print(myDisciplines2.drop_duplicates()) print() #Make a simple report print("Theme Keywords: " + myTheme + " " + myTheme2) print("Disciplines addressing those themes:") print (myDisciplines2.drop_duplicates()) #Store your results in a list myDisciplinesList2 = myDisciplines2.drop_duplicates().tolist() #Get all the paper titles and their journals, for ONLY your result rows. allThemePapers2 = allThemePapers.loc[resultList2,['Article Title','Source Title']] #Make your dataframe into a list of lists, for prettier reporting allThemePapersList2 = allThemePapers2.values.tolist() #Loop over your list of lists for a report print("\n\nThere are " + str(len(allThemePapersList2)) \ + " papers addressing " + myTheme + " and " + myTheme2 + "\n") for paper in allThemePapersList2: print(paper[0]) print(paper[1]) print() #Extract more information for a saved list columnsToKeep = ['Article Title','Source Title','Publication Year','Volume','Issue',\ 'Discipline','Abstract','Abstract_lower'] allThemePapers2 = myCorpus.loc[resultList2,columnsToKeep] #Group the papers by discipline allThemePapers2 = allThemePapers2.sort_values('Discipline') #Write the result to Excel file allThemePapers2.to_excel("C:/Workshop/Papers_on_" + myTheme + "_" + myTheme2 + ".xlsx") #Make a word cloud for your customized group cloudString2 = ' '.join(allThemePapers2['Abstract_lower']) #Some words we already know are there because of our Web Of Science search #Omit those from the word cloud wordsToOmit = ['covid','covid-19','texas','study','method','data','results',\ 'pandemic','median','conclusion','participants','scale'\ 'questionnaire','control','research','effect'] for word in wordsToOmit: cloudString2 = cloudString2.replace(word, '') wordcloud2 = WordCloud(width=800, height=640, background_color='white', \ min_font_size=10).generate(cloudString2) # display the word cloud plt.figure(figsize=(10, 7), facecolor=None) plt.imshow(wordcloud2) plt.axis("off") plt.tight_layout(pad=0) plt.show()