surveyer.surveyreader

Read info from survey export and make useable

  1"""Read info from survey export and make useable"""
  2
  3import pandas as pd
  4
  5class DataSet:
  6    def __init__(self, filepath):
  7        self.metadata = {}
  8        self.data = pd.DataFrame()
  9
 10        self.read(filepath)
 11        self.preprocess()
 12
 13    def read(self, filepath):
 14        """Reading the survey results from the export from EU survey"""
 15
 16        if filepath.find("xls") > -1:
 17            self.data = pd.read_excel(filepath, header=3)
 18        elif filepath.find("h5") > -1:
 19            self.data = pd.read_hdf(filepath, key="survey")
 20
 21    def _anonymize(self, internaluse=False, outpath=""):
 22        """Deleting entries with personal information.
 23        If internaluse = True, keeping details on affiliation.
 24        If outpath is provided, anonymized dataset is written to xls."""
 25
 26        if "Name" in self.data.columns:
 27            self.data = self.data.drop(
 28                columns=[
 29                    "Name",
 30                    "email",
 31                    "Invitation number",
 32                    "Contribution ID",
 33                    "User name",
 34                    "Languages",
 35                ]
 36            )
 37
 38        if (
 39            not internaluse
 40            and "Further description of your role or group" in self.data.columns
 41        ):
 42            self.data = self.data.drop(
 43                columns=[
 44                    "Your institution / organisation / group (for internal purposes only, information will not be public).",
 45                    "Further description of your role or group",
 46                ]
 47            )
 48
 49        if outpath:
 50            self.data.to_hdf(outpath, key="survey")
 51
 52    def _adapt_column_names(self):
 53        """Adapt entries and columns of the table"""
 54
 55        for colname in self.data.columns:
 56            colname_new = str(colname).replace("applicable:", "applicable ")
 57            colname_new = str(colname).replace(": Confidence", "- Confidence")
 58            self.data = self.data.rename(columns={colname: colname_new})
 59
 60    def _create_question_metadata(self):
 61        """Extracting metadata from the survey file.
 62        For each question, an identifier (A...) is defined, and subquestions grouped."""
 63        questions = {}
 64
 65        for colname in self.data.columns:
 66            subtexts = colname.split(":")
 67            questionid = False
 68            for qid in questions:
 69                if questions[qid]["question"] == subtexts[0]:
 70                    questionid = qid
 71            if not questionid:
 72                if len(subtexts) == 1:
 73                    subtexts[0] = colname
 74                questions.setdefault(
 75                    "A" + str(len(questions)),
 76                    {
 77                        "question": subtexts[0],
 78                        "subquestions": [],
 79                        "colnames": [],
 80                        "entrytype": "undefined",
 81                        "params": {},
 82                    },
 83                )
 84                qid = "A" + str(len(questions) - 1)
 85
 86            if len(subtexts) > 1:
 87                questions[qid]["subquestions"].append(subtexts[1 : len(subtexts)])
 88                questions[qid]["colnames"].append(colname)
 89        self.metadata = questions
 90
 91    def preprocess(self):
 92        """Preprocessing the survey data for use"""
 93
 94        self._anonymize()
 95        self._adapt_column_names()
 96        self._create_question_metadata()
 97
 98        self._identify_question_types()
 99
100        self._adapt_answers()
101
102    def _identify_question_types(self):
103        """Identifying question types according to metadata"""
104
105        for qid in self.metadata:
106
107            columns = []
108            self.metadata[qid]["params"].setdefault("subtypes", {})
109            self.metadata[qid]["params"].setdefault("options", {})
110
111            if len(self.metadata[qid]["colnames"]) == 0:
112                columns = [self.metadata[qid]["question"]]
113            else:
114                columns = self.metadata[qid]["colnames"]
115                self.metadata[qid]["entrytype"] = "multiple"
116
117            for colname in columns:
118                columntype = ""
119
120                answersall = [x for x in self.data[colname] if pd.notna(x)]
121                uniqueanswers = list(set(answersall))
122
123                dtype = self.data[colname].dtype
124
125                if str(dtype) == "float64":
126                    columntype = "undefined"
127
128                elif str(dtype) == "datetime64[ns]":
129                    columntype = "date"
130
131                elif len(answersall) == 0:
132                    columntype = "no answers"
133
134                elif isinstance(uniqueanswers[0], str):
135                    columntype = "text"
136
137                    subanswers = []
138                    samelength = True
139                    entrylength = 1
140                    notsingle = False
141
142                    for answers in answersall:
143                        answerssplit = answers.split(";")
144                        if entrylength == 1:
145                            entrylength = len(answerssplit)
146
147                        if answers == answersall[0] and len(answerssplit) > 1:
148                            notsingle = True
149
150                        if len(answerssplit) != entrylength and samelength:
151                            samelength = False
152
153                        for subans in answerssplit:
154                            if not subans.lstrip(" ") in subanswers:
155                                subanswers.append(subans.lstrip(" "))
156
157                    # see if answers contain /10 -> rating
158
159                    if (
160                        str(uniqueanswers[0]).find("/5") > 0
161                        or str(uniqueanswers[0]).find("/10") > 0
162                    ) and str(uniqueanswers[0]).find("ttp") < 0:
163                        columntype = "rating"
164
165                    # see if entries can be split with different length ->
166                    # multichoice
167
168                    elif entrylength > 1 and not samelength:
169                        columntype = "enumerate"
170
171                    # see if entries can be split with same length -> ranking
172
173                    elif entrylength > 1 and samelength and notsingle:
174                        columntype = "ranking"
175
176                    # see if entries are repetitive -> select
177
178                    elif entrylength == 1 and len(subanswers) < len(answersall):
179                        columntype = "select"
180
181                    if columntype != "text":
182                        if columntype == "rating":
183                            self.metadata[qid]["params"]["options"].setdefault(
184                                "factor", int(str(uniqueanswers[0]).split("/")[1])
185                            )
186                        else:
187                            self.metadata[qid]["params"]["options"].setdefault(
188                                colname, subanswers
189                            )
190
191                if not columntype:
192                    columntype = "undefined"
193
194                if self.metadata[qid]["entrytype"] == "multiple":
195                    self.metadata[qid]["params"]["subtypes"].setdefault(
196                        colname, columntype
197                    )
198                else:
199                    self.metadata[qid]["entrytype"] = columntype
200
201    def _adapt_answers(self):
202        """Change the entries in the data table for better processing"""
203        pass
204
205    def list_questions(self, filter_type=""):
206        """Returns DataFrame with metadata of the questions.
207        If filter_type provides type of question, only matching entries are listed
208        """
209
210        meta_df = pd.DataFrame(self.metadata).T
211        if filter_type:
212            meta_df = meta_df[meta_df["entrytype"] == filter_type]
213        return meta_df
214
215    def get_question_data(self, questionid):
216        """Returns dataframe according to question identifier"""
217        colnames = []
218        if len(self.metadata[questionid]["colnames"]) > 0:
219            colnames = self.metadata[questionid]["colnames"]
220        else:
221            colnames = [self.metadata[questionid]["question"]]
222
223        return self.data.loc[:, colnames]
224
225    def _get_colnames(self, questionids, acceptedtypes=[]):
226        """Returns column names for given question IDs, facilitating filtering for specific types."""
227
228        colnames = []
229
230        for qid in questionids:
231            if (
232                acceptedtypes
233                and self.metadata[qid]["entrytype"] in acceptedtypes
234                or not acceptedtypes
235            ):
236                colnames.append(self.metadata[qid]["question"])
237            elif self.metadata[qid]["entrytype"] == "multiple":
238                for colname in self.metadata[qid]["params"]["subtypes"]:
239                    if (
240                        self.metadata[qid]["params"]["subtypes"][colname]
241                        in acceptedtypes
242                    ):
243                        colnames.append(colname)
244            else:
245                print(
246                    "For question-ID",
247                    qid,
248                    "the type is not an accepted type ",
249                    acceptedtypes,
250                    " or 'multiple', will not add type",
251                    self.metadata[qid]["entrytype"],
252                )
253
254        return colnames
255
256    def extract_subset(self, questionids, acceptedtypes=[]):
257        """Returns dataframe with subset of data according to questionids, and filters for provided types of questions."""
258        colnames = self._get_colnames(questionids, acceptedtypes)
259
260        return self.data[colnames]
class DataSet:
  6class DataSet:
  7    def __init__(self, filepath):
  8        self.metadata = {}
  9        self.data = pd.DataFrame()
 10
 11        self.read(filepath)
 12        self.preprocess()
 13
 14    def read(self, filepath):
 15        """Reading the survey results from the export from EU survey"""
 16
 17        if filepath.find("xls") > -1:
 18            self.data = pd.read_excel(filepath, header=3)
 19        elif filepath.find("h5") > -1:
 20            self.data = pd.read_hdf(filepath, key="survey")
 21
 22    def _anonymize(self, internaluse=False, outpath=""):
 23        """Deleting entries with personal information.
 24        If internaluse = True, keeping details on affiliation.
 25        If outpath is provided, anonymized dataset is written to xls."""
 26
 27        if "Name" in self.data.columns:
 28            self.data = self.data.drop(
 29                columns=[
 30                    "Name",
 31                    "email",
 32                    "Invitation number",
 33                    "Contribution ID",
 34                    "User name",
 35                    "Languages",
 36                ]
 37            )
 38
 39        if (
 40            not internaluse
 41            and "Further description of your role or group" in self.data.columns
 42        ):
 43            self.data = self.data.drop(
 44                columns=[
 45                    "Your institution / organisation / group (for internal purposes only, information will not be public).",
 46                    "Further description of your role or group",
 47                ]
 48            )
 49
 50        if outpath:
 51            self.data.to_hdf(outpath, key="survey")
 52
 53    def _adapt_column_names(self):
 54        """Adapt entries and columns of the table"""
 55
 56        for colname in self.data.columns:
 57            colname_new = str(colname).replace("applicable:", "applicable ")
 58            colname_new = str(colname).replace(": Confidence", "- Confidence")
 59            self.data = self.data.rename(columns={colname: colname_new})
 60
 61    def _create_question_metadata(self):
 62        """Extracting metadata from the survey file.
 63        For each question, an identifier (A...) is defined, and subquestions grouped."""
 64        questions = {}
 65
 66        for colname in self.data.columns:
 67            subtexts = colname.split(":")
 68            questionid = False
 69            for qid in questions:
 70                if questions[qid]["question"] == subtexts[0]:
 71                    questionid = qid
 72            if not questionid:
 73                if len(subtexts) == 1:
 74                    subtexts[0] = colname
 75                questions.setdefault(
 76                    "A" + str(len(questions)),
 77                    {
 78                        "question": subtexts[0],
 79                        "subquestions": [],
 80                        "colnames": [],
 81                        "entrytype": "undefined",
 82                        "params": {},
 83                    },
 84                )
 85                qid = "A" + str(len(questions) - 1)
 86
 87            if len(subtexts) > 1:
 88                questions[qid]["subquestions"].append(subtexts[1 : len(subtexts)])
 89                questions[qid]["colnames"].append(colname)
 90        self.metadata = questions
 91
 92    def preprocess(self):
 93        """Preprocessing the survey data for use"""
 94
 95        self._anonymize()
 96        self._adapt_column_names()
 97        self._create_question_metadata()
 98
 99        self._identify_question_types()
100
101        self._adapt_answers()
102
103    def _identify_question_types(self):
104        """Identifying question types according to metadata"""
105
106        for qid in self.metadata:
107
108            columns = []
109            self.metadata[qid]["params"].setdefault("subtypes", {})
110            self.metadata[qid]["params"].setdefault("options", {})
111
112            if len(self.metadata[qid]["colnames"]) == 0:
113                columns = [self.metadata[qid]["question"]]
114            else:
115                columns = self.metadata[qid]["colnames"]
116                self.metadata[qid]["entrytype"] = "multiple"
117
118            for colname in columns:
119                columntype = ""
120
121                answersall = [x for x in self.data[colname] if pd.notna(x)]
122                uniqueanswers = list(set(answersall))
123
124                dtype = self.data[colname].dtype
125
126                if str(dtype) == "float64":
127                    columntype = "undefined"
128
129                elif str(dtype) == "datetime64[ns]":
130                    columntype = "date"
131
132                elif len(answersall) == 0:
133                    columntype = "no answers"
134
135                elif isinstance(uniqueanswers[0], str):
136                    columntype = "text"
137
138                    subanswers = []
139                    samelength = True
140                    entrylength = 1
141                    notsingle = False
142
143                    for answers in answersall:
144                        answerssplit = answers.split(";")
145                        if entrylength == 1:
146                            entrylength = len(answerssplit)
147
148                        if answers == answersall[0] and len(answerssplit) > 1:
149                            notsingle = True
150
151                        if len(answerssplit) != entrylength and samelength:
152                            samelength = False
153
154                        for subans in answerssplit:
155                            if not subans.lstrip(" ") in subanswers:
156                                subanswers.append(subans.lstrip(" "))
157
158                    # see if answers contain /10 -> rating
159
160                    if (
161                        str(uniqueanswers[0]).find("/5") > 0
162                        or str(uniqueanswers[0]).find("/10") > 0
163                    ) and str(uniqueanswers[0]).find("ttp") < 0:
164                        columntype = "rating"
165
166                    # see if entries can be split with different length ->
167                    # multichoice
168
169                    elif entrylength > 1 and not samelength:
170                        columntype = "enumerate"
171
172                    # see if entries can be split with same length -> ranking
173
174                    elif entrylength > 1 and samelength and notsingle:
175                        columntype = "ranking"
176
177                    # see if entries are repetitive -> select
178
179                    elif entrylength == 1 and len(subanswers) < len(answersall):
180                        columntype = "select"
181
182                    if columntype != "text":
183                        if columntype == "rating":
184                            self.metadata[qid]["params"]["options"].setdefault(
185                                "factor", int(str(uniqueanswers[0]).split("/")[1])
186                            )
187                        else:
188                            self.metadata[qid]["params"]["options"].setdefault(
189                                colname, subanswers
190                            )
191
192                if not columntype:
193                    columntype = "undefined"
194
195                if self.metadata[qid]["entrytype"] == "multiple":
196                    self.metadata[qid]["params"]["subtypes"].setdefault(
197                        colname, columntype
198                    )
199                else:
200                    self.metadata[qid]["entrytype"] = columntype
201
202    def _adapt_answers(self):
203        """Change the entries in the data table for better processing"""
204        pass
205
206    def list_questions(self, filter_type=""):
207        """Returns DataFrame with metadata of the questions.
208        If filter_type provides type of question, only matching entries are listed
209        """
210
211        meta_df = pd.DataFrame(self.metadata).T
212        if filter_type:
213            meta_df = meta_df[meta_df["entrytype"] == filter_type]
214        return meta_df
215
216    def get_question_data(self, questionid):
217        """Returns dataframe according to question identifier"""
218        colnames = []
219        if len(self.metadata[questionid]["colnames"]) > 0:
220            colnames = self.metadata[questionid]["colnames"]
221        else:
222            colnames = [self.metadata[questionid]["question"]]
223
224        return self.data.loc[:, colnames]
225
226    def _get_colnames(self, questionids, acceptedtypes=[]):
227        """Returns column names for given question IDs, facilitating filtering for specific types."""
228
229        colnames = []
230
231        for qid in questionids:
232            if (
233                acceptedtypes
234                and self.metadata[qid]["entrytype"] in acceptedtypes
235                or not acceptedtypes
236            ):
237                colnames.append(self.metadata[qid]["question"])
238            elif self.metadata[qid]["entrytype"] == "multiple":
239                for colname in self.metadata[qid]["params"]["subtypes"]:
240                    if (
241                        self.metadata[qid]["params"]["subtypes"][colname]
242                        in acceptedtypes
243                    ):
244                        colnames.append(colname)
245            else:
246                print(
247                    "For question-ID",
248                    qid,
249                    "the type is not an accepted type ",
250                    acceptedtypes,
251                    " or 'multiple', will not add type",
252                    self.metadata[qid]["entrytype"],
253                )
254
255        return colnames
256
257    def extract_subset(self, questionids, acceptedtypes=[]):
258        """Returns dataframe with subset of data according to questionids, and filters for provided types of questions."""
259        colnames = self._get_colnames(questionids, acceptedtypes)
260
261        return self.data[colnames]
DataSet(filepath)
 7    def __init__(self, filepath):
 8        self.metadata = {}
 9        self.data = pd.DataFrame()
10
11        self.read(filepath)
12        self.preprocess()
metadata
data
def read(self, filepath):
14    def read(self, filepath):
15        """Reading the survey results from the export from EU survey"""
16
17        if filepath.find("xls") > -1:
18            self.data = pd.read_excel(filepath, header=3)
19        elif filepath.find("h5") > -1:
20            self.data = pd.read_hdf(filepath, key="survey")

Reading the survey results from the export from EU survey

def preprocess(self):
 92    def preprocess(self):
 93        """Preprocessing the survey data for use"""
 94
 95        self._anonymize()
 96        self._adapt_column_names()
 97        self._create_question_metadata()
 98
 99        self._identify_question_types()
100
101        self._adapt_answers()

Preprocessing the survey data for use

def list_questions(self, filter_type=''):
206    def list_questions(self, filter_type=""):
207        """Returns DataFrame with metadata of the questions.
208        If filter_type provides type of question, only matching entries are listed
209        """
210
211        meta_df = pd.DataFrame(self.metadata).T
212        if filter_type:
213            meta_df = meta_df[meta_df["entrytype"] == filter_type]
214        return meta_df

Returns DataFrame with metadata of the questions. If filter_type provides type of question, only matching entries are listed

def get_question_data(self, questionid):
216    def get_question_data(self, questionid):
217        """Returns dataframe according to question identifier"""
218        colnames = []
219        if len(self.metadata[questionid]["colnames"]) > 0:
220            colnames = self.metadata[questionid]["colnames"]
221        else:
222            colnames = [self.metadata[questionid]["question"]]
223
224        return self.data.loc[:, colnames]

Returns dataframe according to question identifier

def extract_subset(self, questionids, acceptedtypes=[]):
257    def extract_subset(self, questionids, acceptedtypes=[]):
258        """Returns dataframe with subset of data according to questionids, and filters for provided types of questions."""
259        colnames = self._get_colnames(questionids, acceptedtypes)
260
261        return self.data[colnames]

Returns dataframe with subset of data according to questionids, and filters for provided types of questions.