surveyer.surveyreader
Read info from survey export and make useable
1"""Read info from survey export and make useable""" 2 3import pandas as pd 4 5class DataSet: 6 def __init__(self, filepath): 7 self.metadata = {} 8 self.data = pd.DataFrame() 9 10 self.read(filepath) 11 self.preprocess() 12 13 def read(self, filepath): 14 """Reading the survey results from the export from EU survey""" 15 16 if filepath.find("xls") > -1: 17 self.data = pd.read_excel(filepath, header=3) 18 elif filepath.find("h5") > -1: 19 self.data = pd.read_hdf(filepath, key="survey") 20 21 def _anonymize(self, internaluse=False, outpath=""): 22 """Deleting entries with personal information. 23 If internaluse = True, keeping details on affiliation. 24 If outpath is provided, anonymized dataset is written to xls.""" 25 26 if "Name" in self.data.columns: 27 self.data = self.data.drop( 28 columns=[ 29 "Name", 30 "email", 31 "Invitation number", 32 "Contribution ID", 33 "User name", 34 "Languages", 35 ] 36 ) 37 38 if ( 39 not internaluse 40 and "Further description of your role or group" in self.data.columns 41 ): 42 self.data = self.data.drop( 43 columns=[ 44 "Your institution / organisation / group (for internal purposes only, information will not be public).", 45 "Further description of your role or group", 46 ] 47 ) 48 49 if outpath: 50 self.data.to_hdf(outpath, key="survey") 51 52 def _adapt_column_names(self): 53 """Adapt entries and columns of the table""" 54 55 for colname in self.data.columns: 56 colname_new = str(colname).replace("applicable:", "applicable ") 57 colname_new = str(colname).replace(": Confidence", "- Confidence") 58 self.data = self.data.rename(columns={colname: colname_new}) 59 60 def _create_question_metadata(self): 61 """Extracting metadata from the survey file. 62 For each question, an identifier (A...) is defined, and subquestions grouped.""" 63 questions = {} 64 65 for colname in self.data.columns: 66 subtexts = colname.split(":") 67 questionid = False 68 for qid in questions: 69 if questions[qid]["question"] == subtexts[0]: 70 questionid = qid 71 if not questionid: 72 if len(subtexts) == 1: 73 subtexts[0] = colname 74 questions.setdefault( 75 "A" + str(len(questions)), 76 { 77 "question": subtexts[0], 78 "subquestions": [], 79 "colnames": [], 80 "entrytype": "undefined", 81 "params": {}, 82 }, 83 ) 84 qid = "A" + str(len(questions) - 1) 85 86 if len(subtexts) > 1: 87 questions[qid]["subquestions"].append(subtexts[1 : len(subtexts)]) 88 questions[qid]["colnames"].append(colname) 89 self.metadata = questions 90 91 def preprocess(self): 92 """Preprocessing the survey data for use""" 93 94 self._anonymize() 95 self._adapt_column_names() 96 self._create_question_metadata() 97 98 self._identify_question_types() 99 100 self._adapt_answers() 101 102 def _identify_question_types(self): 103 """Identifying question types according to metadata""" 104 105 for qid in self.metadata: 106 107 columns = [] 108 self.metadata[qid]["params"].setdefault("subtypes", {}) 109 self.metadata[qid]["params"].setdefault("options", {}) 110 111 if len(self.metadata[qid]["colnames"]) == 0: 112 columns = [self.metadata[qid]["question"]] 113 else: 114 columns = self.metadata[qid]["colnames"] 115 self.metadata[qid]["entrytype"] = "multiple" 116 117 for colname in columns: 118 columntype = "" 119 120 answersall = [x for x in self.data[colname] if pd.notna(x)] 121 uniqueanswers = list(set(answersall)) 122 123 dtype = self.data[colname].dtype 124 125 if str(dtype) == "float64": 126 columntype = "undefined" 127 128 elif str(dtype) == "datetime64[ns]": 129 columntype = "date" 130 131 elif len(answersall) == 0: 132 columntype = "no answers" 133 134 elif isinstance(uniqueanswers[0], str): 135 columntype = "text" 136 137 subanswers = [] 138 samelength = True 139 entrylength = 1 140 notsingle = False 141 142 for answers in answersall: 143 answerssplit = answers.split(";") 144 if entrylength == 1: 145 entrylength = len(answerssplit) 146 147 if answers == answersall[0] and len(answerssplit) > 1: 148 notsingle = True 149 150 if len(answerssplit) != entrylength and samelength: 151 samelength = False 152 153 for subans in answerssplit: 154 if not subans.lstrip(" ") in subanswers: 155 subanswers.append(subans.lstrip(" ")) 156 157 # see if answers contain /10 -> rating 158 159 if ( 160 str(uniqueanswers[0]).find("/5") > 0 161 or str(uniqueanswers[0]).find("/10") > 0 162 ) and str(uniqueanswers[0]).find("ttp") < 0: 163 columntype = "rating" 164 165 # see if entries can be split with different length -> 166 # multichoice 167 168 elif entrylength > 1 and not samelength: 169 columntype = "enumerate" 170 171 # see if entries can be split with same length -> ranking 172 173 elif entrylength > 1 and samelength and notsingle: 174 columntype = "ranking" 175 176 # see if entries are repetitive -> select 177 178 elif entrylength == 1 and len(subanswers) < len(answersall): 179 columntype = "select" 180 181 if columntype != "text": 182 if columntype == "rating": 183 self.metadata[qid]["params"]["options"].setdefault( 184 "factor", int(str(uniqueanswers[0]).split("/")[1]) 185 ) 186 else: 187 self.metadata[qid]["params"]["options"].setdefault( 188 colname, subanswers 189 ) 190 191 if not columntype: 192 columntype = "undefined" 193 194 if self.metadata[qid]["entrytype"] == "multiple": 195 self.metadata[qid]["params"]["subtypes"].setdefault( 196 colname, columntype 197 ) 198 else: 199 self.metadata[qid]["entrytype"] = columntype 200 201 def _adapt_answers(self): 202 """Change the entries in the data table for better processing""" 203 pass 204 205 def list_questions(self, filter_type=""): 206 """Returns DataFrame with metadata of the questions. 207 If filter_type provides type of question, only matching entries are listed 208 """ 209 210 meta_df = pd.DataFrame(self.metadata).T 211 if filter_type: 212 meta_df = meta_df[meta_df["entrytype"] == filter_type] 213 return meta_df 214 215 def get_question_data(self, questionid): 216 """Returns dataframe according to question identifier""" 217 colnames = [] 218 if len(self.metadata[questionid]["colnames"]) > 0: 219 colnames = self.metadata[questionid]["colnames"] 220 else: 221 colnames = [self.metadata[questionid]["question"]] 222 223 return self.data.loc[:, colnames] 224 225 def _get_colnames(self, questionids, acceptedtypes=[]): 226 """Returns column names for given question IDs, facilitating filtering for specific types.""" 227 228 colnames = [] 229 230 for qid in questionids: 231 if ( 232 acceptedtypes 233 and self.metadata[qid]["entrytype"] in acceptedtypes 234 or not acceptedtypes 235 ): 236 colnames.append(self.metadata[qid]["question"]) 237 elif self.metadata[qid]["entrytype"] == "multiple": 238 for colname in self.metadata[qid]["params"]["subtypes"]: 239 if ( 240 self.metadata[qid]["params"]["subtypes"][colname] 241 in acceptedtypes 242 ): 243 colnames.append(colname) 244 else: 245 print( 246 "For question-ID", 247 qid, 248 "the type is not an accepted type ", 249 acceptedtypes, 250 " or 'multiple', will not add type", 251 self.metadata[qid]["entrytype"], 252 ) 253 254 return colnames 255 256 def extract_subset(self, questionids, acceptedtypes=[]): 257 """Returns dataframe with subset of data according to questionids, and filters for provided types of questions.""" 258 colnames = self._get_colnames(questionids, acceptedtypes) 259 260 return self.data[colnames]
class
DataSet:
6class DataSet: 7 def __init__(self, filepath): 8 self.metadata = {} 9 self.data = pd.DataFrame() 10 11 self.read(filepath) 12 self.preprocess() 13 14 def read(self, filepath): 15 """Reading the survey results from the export from EU survey""" 16 17 if filepath.find("xls") > -1: 18 self.data = pd.read_excel(filepath, header=3) 19 elif filepath.find("h5") > -1: 20 self.data = pd.read_hdf(filepath, key="survey") 21 22 def _anonymize(self, internaluse=False, outpath=""): 23 """Deleting entries with personal information. 24 If internaluse = True, keeping details on affiliation. 25 If outpath is provided, anonymized dataset is written to xls.""" 26 27 if "Name" in self.data.columns: 28 self.data = self.data.drop( 29 columns=[ 30 "Name", 31 "email", 32 "Invitation number", 33 "Contribution ID", 34 "User name", 35 "Languages", 36 ] 37 ) 38 39 if ( 40 not internaluse 41 and "Further description of your role or group" in self.data.columns 42 ): 43 self.data = self.data.drop( 44 columns=[ 45 "Your institution / organisation / group (for internal purposes only, information will not be public).", 46 "Further description of your role or group", 47 ] 48 ) 49 50 if outpath: 51 self.data.to_hdf(outpath, key="survey") 52 53 def _adapt_column_names(self): 54 """Adapt entries and columns of the table""" 55 56 for colname in self.data.columns: 57 colname_new = str(colname).replace("applicable:", "applicable ") 58 colname_new = str(colname).replace(": Confidence", "- Confidence") 59 self.data = self.data.rename(columns={colname: colname_new}) 60 61 def _create_question_metadata(self): 62 """Extracting metadata from the survey file. 63 For each question, an identifier (A...) is defined, and subquestions grouped.""" 64 questions = {} 65 66 for colname in self.data.columns: 67 subtexts = colname.split(":") 68 questionid = False 69 for qid in questions: 70 if questions[qid]["question"] == subtexts[0]: 71 questionid = qid 72 if not questionid: 73 if len(subtexts) == 1: 74 subtexts[0] = colname 75 questions.setdefault( 76 "A" + str(len(questions)), 77 { 78 "question": subtexts[0], 79 "subquestions": [], 80 "colnames": [], 81 "entrytype": "undefined", 82 "params": {}, 83 }, 84 ) 85 qid = "A" + str(len(questions) - 1) 86 87 if len(subtexts) > 1: 88 questions[qid]["subquestions"].append(subtexts[1 : len(subtexts)]) 89 questions[qid]["colnames"].append(colname) 90 self.metadata = questions 91 92 def preprocess(self): 93 """Preprocessing the survey data for use""" 94 95 self._anonymize() 96 self._adapt_column_names() 97 self._create_question_metadata() 98 99 self._identify_question_types() 100 101 self._adapt_answers() 102 103 def _identify_question_types(self): 104 """Identifying question types according to metadata""" 105 106 for qid in self.metadata: 107 108 columns = [] 109 self.metadata[qid]["params"].setdefault("subtypes", {}) 110 self.metadata[qid]["params"].setdefault("options", {}) 111 112 if len(self.metadata[qid]["colnames"]) == 0: 113 columns = [self.metadata[qid]["question"]] 114 else: 115 columns = self.metadata[qid]["colnames"] 116 self.metadata[qid]["entrytype"] = "multiple" 117 118 for colname in columns: 119 columntype = "" 120 121 answersall = [x for x in self.data[colname] if pd.notna(x)] 122 uniqueanswers = list(set(answersall)) 123 124 dtype = self.data[colname].dtype 125 126 if str(dtype) == "float64": 127 columntype = "undefined" 128 129 elif str(dtype) == "datetime64[ns]": 130 columntype = "date" 131 132 elif len(answersall) == 0: 133 columntype = "no answers" 134 135 elif isinstance(uniqueanswers[0], str): 136 columntype = "text" 137 138 subanswers = [] 139 samelength = True 140 entrylength = 1 141 notsingle = False 142 143 for answers in answersall: 144 answerssplit = answers.split(";") 145 if entrylength == 1: 146 entrylength = len(answerssplit) 147 148 if answers == answersall[0] and len(answerssplit) > 1: 149 notsingle = True 150 151 if len(answerssplit) != entrylength and samelength: 152 samelength = False 153 154 for subans in answerssplit: 155 if not subans.lstrip(" ") in subanswers: 156 subanswers.append(subans.lstrip(" ")) 157 158 # see if answers contain /10 -> rating 159 160 if ( 161 str(uniqueanswers[0]).find("/5") > 0 162 or str(uniqueanswers[0]).find("/10") > 0 163 ) and str(uniqueanswers[0]).find("ttp") < 0: 164 columntype = "rating" 165 166 # see if entries can be split with different length -> 167 # multichoice 168 169 elif entrylength > 1 and not samelength: 170 columntype = "enumerate" 171 172 # see if entries can be split with same length -> ranking 173 174 elif entrylength > 1 and samelength and notsingle: 175 columntype = "ranking" 176 177 # see if entries are repetitive -> select 178 179 elif entrylength == 1 and len(subanswers) < len(answersall): 180 columntype = "select" 181 182 if columntype != "text": 183 if columntype == "rating": 184 self.metadata[qid]["params"]["options"].setdefault( 185 "factor", int(str(uniqueanswers[0]).split("/")[1]) 186 ) 187 else: 188 self.metadata[qid]["params"]["options"].setdefault( 189 colname, subanswers 190 ) 191 192 if not columntype: 193 columntype = "undefined" 194 195 if self.metadata[qid]["entrytype"] == "multiple": 196 self.metadata[qid]["params"]["subtypes"].setdefault( 197 colname, columntype 198 ) 199 else: 200 self.metadata[qid]["entrytype"] = columntype 201 202 def _adapt_answers(self): 203 """Change the entries in the data table for better processing""" 204 pass 205 206 def list_questions(self, filter_type=""): 207 """Returns DataFrame with metadata of the questions. 208 If filter_type provides type of question, only matching entries are listed 209 """ 210 211 meta_df = pd.DataFrame(self.metadata).T 212 if filter_type: 213 meta_df = meta_df[meta_df["entrytype"] == filter_type] 214 return meta_df 215 216 def get_question_data(self, questionid): 217 """Returns dataframe according to question identifier""" 218 colnames = [] 219 if len(self.metadata[questionid]["colnames"]) > 0: 220 colnames = self.metadata[questionid]["colnames"] 221 else: 222 colnames = [self.metadata[questionid]["question"]] 223 224 return self.data.loc[:, colnames] 225 226 def _get_colnames(self, questionids, acceptedtypes=[]): 227 """Returns column names for given question IDs, facilitating filtering for specific types.""" 228 229 colnames = [] 230 231 for qid in questionids: 232 if ( 233 acceptedtypes 234 and self.metadata[qid]["entrytype"] in acceptedtypes 235 or not acceptedtypes 236 ): 237 colnames.append(self.metadata[qid]["question"]) 238 elif self.metadata[qid]["entrytype"] == "multiple": 239 for colname in self.metadata[qid]["params"]["subtypes"]: 240 if ( 241 self.metadata[qid]["params"]["subtypes"][colname] 242 in acceptedtypes 243 ): 244 colnames.append(colname) 245 else: 246 print( 247 "For question-ID", 248 qid, 249 "the type is not an accepted type ", 250 acceptedtypes, 251 " or 'multiple', will not add type", 252 self.metadata[qid]["entrytype"], 253 ) 254 255 return colnames 256 257 def extract_subset(self, questionids, acceptedtypes=[]): 258 """Returns dataframe with subset of data according to questionids, and filters for provided types of questions.""" 259 colnames = self._get_colnames(questionids, acceptedtypes) 260 261 return self.data[colnames]
def
read(self, filepath):
14 def read(self, filepath): 15 """Reading the survey results from the export from EU survey""" 16 17 if filepath.find("xls") > -1: 18 self.data = pd.read_excel(filepath, header=3) 19 elif filepath.find("h5") > -1: 20 self.data = pd.read_hdf(filepath, key="survey")
Reading the survey results from the export from EU survey
def
preprocess(self):
92 def preprocess(self): 93 """Preprocessing the survey data for use""" 94 95 self._anonymize() 96 self._adapt_column_names() 97 self._create_question_metadata() 98 99 self._identify_question_types() 100 101 self._adapt_answers()
Preprocessing the survey data for use
def
list_questions(self, filter_type=''):
206 def list_questions(self, filter_type=""): 207 """Returns DataFrame with metadata of the questions. 208 If filter_type provides type of question, only matching entries are listed 209 """ 210 211 meta_df = pd.DataFrame(self.metadata).T 212 if filter_type: 213 meta_df = meta_df[meta_df["entrytype"] == filter_type] 214 return meta_df
Returns DataFrame with metadata of the questions. If filter_type provides type of question, only matching entries are listed
def
get_question_data(self, questionid):
216 def get_question_data(self, questionid): 217 """Returns dataframe according to question identifier""" 218 colnames = [] 219 if len(self.metadata[questionid]["colnames"]) > 0: 220 colnames = self.metadata[questionid]["colnames"] 221 else: 222 colnames = [self.metadata[questionid]["question"]] 223 224 return self.data.loc[:, colnames]
Returns dataframe according to question identifier
def
extract_subset(self, questionids, acceptedtypes=[]):
257 def extract_subset(self, questionids, acceptedtypes=[]): 258 """Returns dataframe with subset of data according to questionids, and filters for provided types of questions.""" 259 colnames = self._get_colnames(questionids, acceptedtypes) 260 261 return self.data[colnames]
Returns dataframe with subset of data according to questionids, and filters for provided types of questions.