"""Another library of useful utility routines for SPSS""" import spss, spssaux, spssdata import sys import time, os, re, fnmatch, glob, copy from spssaux import _smartquote from spssaux import u ok1600 = spssaux.getSpssVersion() >= [16,0,0] ###import pywin.debugger, sys, time ###import traceback #Copyright (C) 2005-2008 by SPSS Inc. # JKP #History # 20-jan-2006 Add FindFilesWithVars function # 05-Dec-2006 Add FindEmptyVars function # 15-Dec-2006 Add FindUnlabelledValues function # 12-Jan-2006 Add genCategoryList function # 07-Feb-2007 Add rankvarsincase function # 03-Jul-2007 Add support for weights to genCategoryList # 24-Jul-2007 Add mergeByLabel function to add cases with renaming based on variable labels # 02-Nov-2007 Add dupVarnameCheck function to expand and check variable lists # 07-Jan-2008 Add genCategoriesWSubtotals and CopyValueSets # 01-Apr-2008 Add getVarValues and generalizedSplit # 13-Jun-2008 Add genValueLabels # 19-Aug-2008 Add applySyntaxToFiles # 9-Sep-2008 Add labelseparator option to CreateBasisVariables and robustify syntax. __author__ = 'spss' __version__= '1.8.1' class Error(Exception): pass def _safeval(val, quot): "return safe value for quoting with quot, which may be single or double quote or blank" return quot == " " and val or val.replace(quot, quot+quot) def CreateBasisVariables(varindex, root, maxvars=None, usevaluelabels=False, macroname=None, order="A", labelseparator=" = "): """Create a set of dummy variables that span the values of variable with index varindex (within any current filter) . varindex can be an int or a Variable object (from a VariablDict) This function works for numeric and string variables except those with date or time format. The new variables are named root n, starting with n=1 in ascending value order (by default). Any existing variables with these names are overwritten. Each variable is labeled with the underlying variable name and the value it represents. If usevaluelabels is True, the value label, if any, is used in place of the value. Since the new label will contain the name of the underlying variable, the underlying label may be truncated. Missing values are ignored. If maxvars is specified, no more than maxvars will be created. The function returns the number of variables created. If macroName is specified, an SPSS macro with that name will be produced containing the names of the created dummy variables omitting root 1 (making the reference category the first one). If order="D", the categories are in descending order, and the reference (omitted) category will the last one. Generated labels have the form variable = value by default. labelseparator can be used to choose a different separator, e.g., labelseparator=": ". The function returns the maximum n for the root variables created. """ varindex = int(varindex) varname = spss.GetVariableName(varindex) vartype = spss.GetVariableType(varindex) xptail, quot = vartype == 0 and ("@number", " ") or ("@string", "\"") xpath ="//pivotTable[@subType='Frequencies']/dimension/group[1]//category/" + xptail try: tag, ignore = spssaux.CreateXMLOutput("FREQUENCIES " + varname + (order=="D" and "/FORMAT DVALUE" or "") +"/statistics none.") freqvalues = spss.EvaluateXPath(tag, '/outputTree', xpath) if maxvars: freqvalues = freqvalues[:maxvars] spss.DeleteXPathHandle(tag) if len(freqvalues) == 0: raise Error, "No variable values found" if usevaluelabels: vlabels = spssaux.GetValueLabels(varindex) for item in vlabels: vlabels[item] = vlabels[item].replace('"', '""') #ensure dbl quoted text can be quoted compexpr = "COMPUTE %s_%d = %s EQ " + quot + "%s" + quot + "." labelexpr = """VARIABLE LABEL %s_%d "%s%s%s".""" genlist = [compexpr%(root, i, varname, _safeval(v, quot)) for i, v in enumerate(freqvalues)] if usevaluelabels: labellist = [labelexpr%(root, i, varname, labelseparator, vlabels.get(v,v)) for i, v in enumerate(freqvalues)] else: labellist = [labelexpr%(root, i, varname, labelseparator, v) for i, v in enumerate(freqvalues)] spss.Submit(genlist) spss.Submit(labellist) if macroname: spss.SetMacroValue(macroname, " ".join([root + '_' + str(i) for i in range(1, len(freqvalues))])) return len(freqvalues) except: print sys.exc_info() raise def CreateFileNameWDate(basename=None): """Create a filename, including path, of the form base_datetime.ext where datetime is the current date and time in a file system safe format of YYYY-MM-DD_HH-MM. If the basename is not specified, the filename of the active dataset is used. If there is none, ValueError is raised. If basename already contains a datetime stamp at the end (but before the extension), it is removed.""" # create string with current date and time in format YYYY-MM-DD-HH-MM dt = time.strftime("_%Y-%m-%d_%H-%M") if not basename: basename= spssaux.GetDatasetInfo("Data") if not basename: raise ValueError("Default name not available") root, ext = os.path.splitext(basename) # remove datestamp if present datestamp = re.search(r"_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}\Z",root) if datestamp: root = root[:datestamp.start()] return root + dt + ext def FindMostRecentFile(basename): """Return the full filename, including path if included in basename, of the most recent file matching basename or None if there is no match. Matching means the filename equals the basename with or without a timestamp. A timestamp has the format _YYYY-MM-DD_HH-MM just before the extension. If the basename already contains a timestamp, it is checked with and without it. The timestamp pattern conforms to what CreateFileNameWDate produces. """ # date and time in format YYYY-MM-DD-HH-MM is what would be produced by # time.strftime("_%Y-%m-%d_%H-%M") root, ext = os.path.splitext(basename) # remove timestamp if present timestamp = re.search(r"_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}\Z",root) if timestamp: root = root[:timestamp.start()] candidates = glob.glob(root+"_*"+ext) # select only items matching the timestamp pattern, if any candidates=[item for item in candidates if re.search(r"_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}\Z", os.path.splitext(item)[0])] # include original filespec with and without any timestamp if it exists # exclude items that are not files candidates = \ filter(os.path.isfile, candidates\ + filter(os.path.exists, [basename, root+ext])) if not candidates: return None # find most recent candidates = zip(map(os.path.getmtime, candidates),candidates) candidates.sort(reverse=True) return candidates[0][1] # Search selected sav files for selected variable names or other variable attributes def FindFilesWithVars(drivepath="c:/", searchlist=[".*"],infotype='names', filenames=r"*.sav", silent=False): """Print a report listing files searched and which ones contain all of the searchlist items specified. Returns list of matching files. drivepath is a location that defines the root of the search (defaults to c:/). searchlist is a list of regular expressions of variable information to search for (defaults to any, which will list all SPSS data files found). infotype is a string that determinesthe type of information to search. It can be names (the default) labels measurementlevels formats types (0=numeric, >0 = string of specified length filenames is a wildcard expression for filenames to search (defaults to *.sav). If silent is True, nothing is printed.""" infotypes=['names','labels','measurementlevels','formats','types'] infoapis=[spss.GetVariableName, spss.GetVariableLabel, spss.GetVariableMeasurementLevel, spss.GetVariableFormat, spss.GetVariableType] try: infof = infoapis[infotypes.index(infotype)] except: raise ValueError, "Invalid type of information to search for: "+infotype matches = [] searchlist = [str(item) for item in searchlist] if not silent: print "searching directory tree " + drivepath + " and filenames like " + filenames print "for files containing all of the ", infotype+":\n" +\ "\n".join(searchlist) searchlistregex = [re.compile(v, re.IGNORECASE) for v in searchlist] f = os.walk(drivepath) #create generator function for (dirname, subdirs, files) in f: for sav in files: if fnmatch.fnmatch(sav, filenames): if not silent: print "searching: ", dirname + "/"+ sav try: spss.Submit("get file='"+dirname+"/"+sav+"'") savsearchlist=_getinfo(infof) if _matchlist(searchlistregex, savsearchlist): if not silent: print "***matched:", dirname + "/" + sav matches.append(dirname + "/" + sav) except: pass return matches def _getinfo(f): """return a list of variable information of requested type in current data file""" kt = spss.GetVariableCount() vl = [] for i in xrange(kt): vl.append(str(f(i))) return vl def _matchlist(regex, texts): """regex is a list of compiled regular expressions. texts is a list of texts. matchlist returns True iff all items in regex match some item in texts.""" for x in regex: for t in texts: if x.match(t): break; else: return False; #x matches nothing in texts return True def FindEmptyVars(vars=None, delete=False, alpha=True): """Scan specified or all variables and determine which are missing or blank for all cases. Return list of names and optionally delete those variables. vars is a list of the (zero-based) index numbers of the variables to check. It can also be a single string of blank-separated numbers or a VariableDict object. By default, all variables are checked. A value is considered empty if it is sysmis or user missing. String variables are also considered empty if their values are all blank. delete specifies whether empty variables should be deleted or not. The return value is a possibly empty list of variable names, not index numbers. Split files should be off when this function is used. If alpha is False, string variables are excluded from the checked list. This is true even if the variable was listed in the vars parameter. Examples: # find but do not delete the empty variables print FindEmptyVars() # use a VariableDict object and do the same thing vard = spssaux.VariableDict() print FindEmptyVars(vars=vard) # use a string of variable numbers and do the same thing but delete the empty variables strvars = [] for v in vard: strvars.append(str(int(v))) strvars = " ".join(strvars) print FindEmptyVars(vars=strvars, delete=True) """ if vars is None: # check all variables (except as governed by alpha) stillstanding = set(xrange(spss.GetVariableCount())) else: vars = spssaux._buildvarlist(vars) stillstanding = set([int(v) for v in vars]) #int ensures that names were not supplied and extracts index from VariableDict objects alphavars = set([]) for v in stillstanding: if spss.GetVariableType(v) > 0: alphavars.add(v) if not alpha: stillstanding.difference_update(alphavars) # remove alpha vars as candidates for deletion if only checking numerics curs = spss.Cursor() try: while True: case= curs.fetchone() if case is None: case = curs.fetchone() #to allow for split file processing if case is None or len(stillstanding) == 0: break for v in stillstanding.copy(): if v in alphavars: if not (case[v] == None or case[v].strip() == ""): stillstanding.discard(v) else: if not case[v] is None: stillstanding.discard(v) finally: curs.close() killlist = [spss.GetVariableName(v) for v in stillstanding] if delete and len(killlist) > 0: spss.Submit("DELETE VARIABLES " + " ".join(killlist)) return killlist def FindEmptyNumericVars(vars=None, delete=False): """Scan specified or all numeric variables and determine which are missing for all cases. Return list of names and optionally delete those variables. vars is a list of the (zero-based) index numbers of the variables to check. It can also be a single string of blank-separated numbers or a VariableDict object. By default, all numeric variables are checked. Any string variables are silently ignored. delete specifies whether empty variables should be deleted or not. The return value is a possibly empty list of variable names. """ return FindEmptyVars(vars, delete, alpha=False) def FindUnlabelledValues(vardict): """Find all unlabelled values of specified set of variables if the variable has any value labels defined. Returns a dictionary where keys are variable names and each value is a possibly empty list of values with no label (other than sysmis). If there are no variables to check, e.g., because no specified variables have any value labels, an empty dictionary is returned. vardict is an spssaux VariableDict object specifying the variables to check. split files should be off for this function. This function requires at least version 2.0.2 of spssaux.""" # This function has to do an OMS operation for each variable in order to get the value labels. This could be slow. varstocheck = [] # list of variable names labelledvalues = [] # list of sets of labelled values allvalues = [] # list of sets of values try: for v in vardict: lbls = v.ValueLabelsTyped if len(lbls) > 0: varstocheck.append(v.VariableName) labelledvalues.append(set(lbls.keys())) allvalues.append(set()) except AttributeError: raise "This function requires at least version 2.0.2 of the spssaux module." if varstocheck == []: return {} try: curs = spssdata.Spssdata(indexes = varstocheck, names=False) for case in curs: for i, val in enumerate(case): if not val is None: allvalues[i].add(val) finally: curs.CClose() # construct list of unlabelled values unlabelled = {} for i in xrange(len(varstocheck)): unlabelled[varstocheck.pop()] = list(allvalues.pop() - labelledvalues.pop()) return unlabelled import operator, spss, spssaux, spssdata def genCategoryList(varnames, specialvalues=None, macroname=None, missing='EXCLUDE', order='D', weightvar=None): """Generate and return sorted list(s) of values with possible insertion of extra values. Optionally create SPSS macros. varnames is a sequence of variable names to process. It can also be a blank-separated string of variable names. specialvalues is a sequence of values that should be inserted before the first zero count or at the end if no zeros or None. If a special value already occurs in a varname, it will be moved. Note that for string variables, special values need to have the same width as the variable, including leading and trailing blanks. macroname is a list of macronames of the same length as varnames to generate or None. missing is 'INCLUDE' or 'EXCLUDE' to determine whether user missing values are included or excluded. order is 'A' or 'D' to specify the sort direction. weightvar can be specified as a variable name to be used as a weight in determing the counts to sort by. It must not occur in varnames. This function is mainly useful as a helper function for Ctables in building CATEGORIES subcommands. It may be useful to combine it with OTHERNM and/or MISSING in the category list. """ varnames = spssaux._buildvarlist(varnames) if macroname: macroname = spssaux._buildvarlist(macroname) if len(varnames) != len(macroname): raise ValueError, "Number of variables does not match number of macro names" if not missing in ['INCLUDE','EXCLUDE']: raise ValueError, "missing specification must be 'INCLUDE' or 'EXCLUDE'" if weightvar: #if weightvar in varnames: # raise ValueError, "weightvar cannot be included in varnames" varnamesAndWeight = varnames + [weightvar] else: varnamesAndWeight = varnames curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False, convertUserMissing= missing=='INCLUDE') nvar = len(varnames) vvalues=[{} for i in range(nvar)] # for accumulating counts for all variable values for cn, case in enumerate(curs): casecpy = copy.copy(case) if weightvar: w = casecpy[nvar] if w is None: w = 0.0 else: w = 1.0 for i in range(nvar): if not casecpy[i] is None: # omit sysmis values and optionally user missing values curval = casecpy[i] vvalues[i][curval] = vvalues[i].get(curval,0.) + w # count occurrences, possibly weighted curs.CClose() valuelist = [] for i in range(nvar): if not specialvalues is None: # remove special values from count list for v in specialvalues: if v in vvalues[i]: del(vvalues[i][v]) valuelist.append(sorted([(value, key) for (key, value) in vvalues[i].iteritems()], reverse = order == 'D')) if not specialvalues is None: for j in range(len(valuelist[i])): if valuelist[i][j][0] == 0: valuelist[i] = valuelist[i][:j] + [(None, v) for v in specialvalues] + valuelist[i][j:] break else: valuelist[i].extend([(None, v) for v in specialvalues]) if macroname: if isinstance(valuelist[i][0][1], basestring): qchar = '"' else: qchar = '' spss.SetMacroValue("!" + macroname[i], " ".join([qchar + str(k) + qchar for (value, k) in valuelist[i]])) return valuelist # get a variable specification # get value labels # accept list of values for subtotals # generate category list for /category # save that as a macro def genCategoriesWSubtotals(varname, subtotallist, macroname, subtotallabel="", specificlabels={}, sort='values', order='a', position='after'): """Define a macro for CTABLES containing all labelled categories with inserted subtotal specifications and return the macro value. This function generates a category list based on value labels with regular and hiding subtotal specifications at selected points. It is useful for tables where a subtotal should not include all the categories preceding (or following) it since the previous subtotal. Note that with such subtotals, it is important to supply a clear label so that the table is not misleading. varname is the variable name whose labels will be used. It is assumed to be numeric. subtotallist is a sequence of values of the appropriate type that should be followed by a subtotal. The subtotal will be a regular subtotal except when there is only one value being subtotaled, in which case it will be a hiding subtotal and the label will be the category label regardless of other settings below. macroname is the name of the macro to generate. If a subtotal label should be specific to the value in subtotallist, include it in a dictionary as specificlabels where the keys are the values of the appropriate type and the values are the specific label. The subtotallabel or default will be used when there is no specific label. If a specific label is given but the value is not in the subtotal list, it is ignored. If subtotallabel is specified, non-hiding subtotals will be labelled with that string By default, the categories are sorted by the category values. If sort is 'labels', the categories are sorted by the value label. order='a' or 'd' can be used to control the direction of the sort. If position is 'after' the category list is set for following subtotals; if 'before', it is set for preceding subtotals. #TODO: make this happen. If there is only one category in a subtotal, make it a hiding subtotal and use the category label as the subtotal label. Examples: genCategoriesWSubtotals("education", [12,16,20], '!categories') genCategoriesWSubtotals("education", [12,13, 16,20], "!categoriesSpLabels", subtotallabel="Together", specificlabels={13:"thirteen", 16:"sixteen"}) """ if sort not in ['values','labels'] or order not in ['a','d'] or position not in ['after','before']: raise ValueError("invalid value for function parameter") if ok1600 and spss.PyInvokeSpss.IsUTF8mode(): unistr = unicode else: unistr = str catlist = [] catcount = 0 # addt will either append to catlist or insert at the beginning if position == "after": def addt(x): catlist.append(x) else: def addt(x): catlist.insert(0,x) vardict = spssaux.VariableDict(namelist=varname) isstr = vardict[0].VariableType vallabeldict = vardict[0].ValueLabels # get list of labeled values and convert to appropriate type so that they will sort correctly according to the type if isstr: vallabels = [v for v in vallabeldict] else: vallabels = [int(v) for v in vallabeldict] if sort == 'values': values = sorted(vallabels, reverse= order=='D') else: values = sorted(vallabels, key= lambda k: vallabeldict[unistr(k)], reverse=order=='D') for v in values: if isstr: addt(_smartquote(v)) else: addt(unistr(v)) catcount = catcount + 1 if v in subtotallist: if catcount == 1: addt('HSUBTOTAL="%s"' % vallabeldict[unistr(v)]) catcount = 0 else: stlabel = _smartquote(specificlabels.get(v, subtotallabel)) if stlabel != '""': stlabel = "=" + stlabel else: stlabel = "" addt("SUBTOTAL" + stlabel) catcount = 0 spss.SetMacroValue(macroname, " ".join(catlist)) return catlist def copyValueSubsets(fromvar, tovars, duplelist, vardict=None, createVars=True): """Copy value labels and optionally values from fromvar to tovars according to partition in duplelist. This function is useful in adding flexibility to Ctables. It maps the values and value labels from one variable into a set of variables with each variable receiving the value labels, and optionally the values, in a selected range. It can create the set of variables or it can just apply the value labels. fromvar is the source variable. tovars is a sequence of one or more variables or a simple, blank-separated string listing the variables. Variables in tovars are assumed to have the same variable type as fromvar. duplelist is a list of pairs of values, one pair per variable in tovars, specifying the range of values whose labels and optionally values should be copied. Values in duplelist must match the type of fromvar. Both endpoints are included. Existing value labels for tovars are replaced. If vardict is not None, it is a VariableDict object containing fromvar. If not supplied, one is created. if createVars is true, the variables listed in tovars are created according to the specifications in duplelist. They must not already exist, and the measurement level is copied from fromvar. Cases where the values are outside the selected range for a variable will have sysmis values. Examples: copy values and value labels from variable educ into educ1, educ2, educ3. educ1 will contain values and labels in the inclusive range 0, 10, educ2 will include the range 0,14, and educ3 will have 12,16 Usually the ranges will partition the values, but overlaps and omissions are permitted. copyValueSubsets("educ", "educ1 educ2 educ3", [(0,12), (13,16), (17,99)]) copy values and labels and create overlapping totals in Ctables: The table will tabulate educ with jobcat. It will have totals for each educ range and an overlapping total for the range 0,16 and a grand total. copyValueSubsets("educ", "educ1 educ2 educ3", [(0,12), (13,16), (17,99)]) spss.Submit("IF NVALID(educ1, educ2) > 0 educ12 = 1") spss.Submit("COMPUTE grand = 1") spss.Submit("ctables /table (educ1+educ2 + educ12 + educ3 + grand)[C] BY jobcat/categories variables=educ1 educ2 educ3 total=yes.") """ tovars = spssaux._buildvarlist(tovars) if not len(tovars) == len(duplelist): raise ValueError("The number of range duples is different from the number of target variables") if not vardict: vardict = spssaux.VariableDict(namelist=fromvar) isstrvar = vardict[0].VariableType > 0 fromvarvl = vardict[fromvar].ValueLabels if createVars and isstrvar: spss.Submit("STRING %s (%s)" % (" ".join(tovars), vardict[fromvar].VariableFormat)) for i, v in enumerate(tovars): lolim, hilim = duplelist[i] vn = tovars[i] if isstrvar: lowfmt, hifmt = _smartquote(lolim), _smartquote(hilim) values = [v for v in fromvarvl if lolim <= v <= hilim] vlspec = [_smartquote(v) + " " + _smartquote(fromvarvl[v]) for v in values] else: lowfmt, hifmt = lolim, hilim values = [v for v in fromvarvl if lolim <= float(v) <= hilim] vlspec = [v + " " + _smartquote(fromvarvl[v]) for v in values] if createVars: spss.Submit("IF %(fromvar)s >= %(lowfmt)s AND %(fromvar)s <= %(hifmt)s %(vn)s = %(fromvar)s." % locals()) spss.Submit("VALUE LABELS %s " % vn + " ".join(vlspec)) if createVars: spss.Submit("VARIABLE LEVEL %s (%s)" % (" ".join(tovars), vardict[fromvar].VariableLevel)) def rankvarsincase(varlist, suffix="_rank"): """create a set of new variables with values matching the rank of variables in varlist. This function requires SPSS 15 or later. varlist is a list, string or variable dictionary of the variables to sort. suffix is a string to append to the names in varlist for the new variables. If the new names match an existing variable, an exception will be raised. Default value is "_rank" Missing values are ranked low.""" varlist = spssaux._buildvarlist(varlist) indexes = range(len(varlist)) curs = spssdata.Spssdata(indexes=varlist, accessType='w', names=False, maxaddbuffer = 8 * len(varlist)) try: for v in varlist: curs.append(spssdata.vdef(v + suffix, vlabel="Rank for " + v, vfmt=("F", 4, 0))) curs.commitdict() for case in curs: curs.casevalues([i for (val, i) in sorted(zip(case, indexes))]) finally: curs.CClose() # match files based on variable labels import spss, spssaux def mergeByLabel(firstds, secondds): """Merge cases from two open datasets renaming the variables in secondds according to the variable labels in firstds to have the firstds names. Unlabeled variables are not renamed. If secondds has a duplicate label and firstds contains that label, the renaming would be ambiguous, so the variable is not renamed. No check is made for variable type compatibility. If renaming would create duplicate variable names, a ValueError exception is raised. This could be caused by ambiguity or partial labelling.""" spss.Submit("DATASET ACTIVATE %s" % secondds) vardict2 = spssaux.VariableDict() labeldict2 = {} for v in vardict2: vl = v.VariableLabel if vl != '': if not vl in labeldict2: labeldict2[vl] = [v.VariableName, 1] else: labeldict2[vl][1] = labeldict2[vl][1]+1 spss.Submit("DATASET ACTIVATE %s" % firstds) labeldict1 = dict([(item.VariableLabel, item.VariableName) for item in spssaux.VariableDict()]) try: del labeldict1[""] except: pass for item in labeldict1: varcount = labeldict2.get(item, ['',1])[1] if varcount > 1: #multiple variables share same label print "label assigned to %d variables in second dataset. Renaming will not occur: '%s'" % (varcount, item) del labeldict2[item] renamein = [] renameout = [] for (label, name) in labeldict1.items(): if not label in labeldict2: print "Variable label not found in second dataset or duplicate label. Variable will not be renamed:", name, label elif name in renameout: print "Variable label not unique in second file:", name, label elif name != labeldict2[label][0]: renamein.append(labeldict2[label][0]) renameout.append(name) # check for duplicate names and fail if any found unrenamed = set(vardict2.variables) - set(renamein) dups = unrenamed.intersection(set(renameout)) if len(dups) > 0: print "Merge stopped: Renaming would create the following duplicate names:\n", "\n".join(sorted(dups)) raise ValueError, "Duplicate Names" if len(renamein) > 0: renamesubcmd = "/RENAME=(" + " ".join(renamein) + "=" + " ".join(renameout) + ")" print "\nRename mapping for dataset %s:" % secondds for inname, outname in zip(renamein, renameout): print inname, "-->", outname else: renamesubcmd = "" cmd = r"""ADD FILES /FILE=* /FILE='%(secondds)s' %(renamesubcmd)s.""" % locals() spss.Submit([cmd, "EXECUTE."]) def dupVarnameCheck(vardict, vlist): """Check the variable list in vlist for duplicates against the VariableDict object vardict. Return a duple of a list of duplicates and the expanded list. TO constructs are resolved against the variable dictionary. Case is ignored in these checks. The expanded list is a list containing individual items and lists where TO was used. Duplicates are not removed from the expanded list in order to facilitate finding the source. vardict is a VariableDict object to be used for expanding TO constructs. vlist is the variable list to be checked as a sequence or a string. If a name is not found in the dictionary, an exception is raised. """ vlist = spssaux._buildvarlist(vlist) varset = set() size = len(vlist) dups = [] variables = vardict.variables variablesUC = [v.upper() for v in variables] expandedlist = [] def fixcase(vname): """closure to correct case of variable vname against a dictionary variable list variables. Return the corrected name""" if vname in variables: return vname else: for i, vv in enumerate(variablesUC): if vname.upper() == vv: return variables[i] raise ValueError("No such variable: " + vname) for i, v in enumerate(vlist): if i < size - 1 and vlist[i+1].upper() == "TO": varlist = vardict.range(fixcase(v), fixcase(vlist[i+2])) expandedlist.append("[" + ", ".join(varlist) + "]") for vv in varlist: if vv.upper() in varset: dups.append([vv, [v, vlist[i+1], vlist[i+2]]]) else: varset.add(vv.upper()) vlist[i+1] = "" vlist[i+2] = "" elif not v == "": expandedlist.append(fixcase(v)) if v.upper() in varset: dups.append(v) else: varset.add(v.upper()) return (dups, expandedlist) def getVarValues(varname, vartype=None, missing='exclude'): """Return a list of the values of variable varname. varname is the variable to tabulate. The name must match the case of the name in SPSS. vartype is the variable type: 0 for numeric and >0 for string. If not supplied, it will be determined by this function. By default, user missing values are excluded. Specify missing="include" to include them. System-missing values are never included""" if not missing in ["include", "exclude"]: raise ValueError, "Missing-value specification must be include or exclude" if vartype is None: vartype = spssaux.VariableDict(namelist=varname)[varname].VariableType xptail, quot = vartype == 0 and ("@number", " ") or ("@string", "\"") xpath ="//pivotTable[@subType='Frequencies']/dimension/group[1]//category/" + xptail cmd = "FREQUENCIES " + varname +"/statistics none" if missing == "include": cmd += "/MISSING=INCLUDE" tag, ignore = spssaux.CreateXMLOutput(cmd) freqvalues = spss.EvaluateXPath(tag, '/outputTree', xpath) spss.DeleteXPathHandle(tag) return freqvalues def generalizedSplit(splitvar, cmd, vartype=None, missing="exclude", errorContinue=True): """Execute cmd for cases having each value of splitvar and return error count. splitvar is a string or numeric SPSS variable. For each value of splitvar, the SPSS command in cmd is executed for those cases matching that value. Although any variable can be specified, the nature of floating point processing means that numeric variables should have only integer values. It is not necessary that the file be sorted by splitvar. The variable named as splitvar must match the name of an SPSS variable including its case. cmd is the SPSS command to execute. Within that command, the current value of splitvar can be used by including %(splitvalue)format-code in the string. The command can also use %(count)format-code to refer to a counter with value of the current iteration. count is zero-based. For example, cmd could be SAVE OUTFILE="c:/temp/output%(splitvalue)d.sav" to include the current integer-valued value of the numeric split variable in the filename or SAVE OUTFILE="c:/temp/output%(count)d.sav" to number the output files starting from zero. Any format codes can be used, but d for a numeric variable and s for a string variable are likely to be the most useful. vartype is optional but can be used to specify the type of the SPSS variable. 0 indicates a numeric variable, and values greater than 0 indicate a string. If not specified, the type will be determined automatically. By default, user and system missing values are excluded from the iterations. Specify missing='include' to include user missing values. System-missing values are always excluded. errorContinue indicates whether the iterations through the values of splitvar should continue or stop if an error occurs. By default, iterations continue. Specify errorContinue=False to stop on error. The function returns a count of the errors. If errorContinue=False, this value will always be 0 or 1. While this function is more general than the built-in split files mechanism, when split files can be used, that mechanism may be faster. split files requires a sort (n log n time) but then only one data pass for a command, and the sort may be amortized over many commands. This function does not require a sort but makes K+1 data passes, where K is the number of distinct values of the splitting variable. """ if vartype is None: vartype = spssaux.VariableDict(namelist=splitvar)[splitvar].VariableType varvalues = getVarValues(splitvar, vartype, missing) cmdprefix="""TEMPORARY. SELECT IF %(splitvar)s = %(selectvalue)s. """ errcount = 0 count = -1 for splitvaluestr in varvalues: count += 1 selectvalue = splitvaluestr try: splitvaluestr = float(splitvaluestr) except: selectvalue = spssaux._smartquote(splitvaluestr) try: spss.Submit((cmdprefix + cmd) % {"splitvar": splitvar, "splitvalue": splitvaluestr, "selectvalue":selectvalue, "count":count}) except: errcount += 1 if errorContinue: continue else: return errcount return errcount def genValueLabels(targetvar, labelvar, vardict=None): """Generate value labels for targetvar from contents of labelvar. Return conflict state. targetvar is any existing variable. labelvar is a variable whose values should be used to label the values of targetvar. vardict is an optional spssaux.VariableDict object containing at least targetvar. If there are conflicting values, the last one encountered wins. If there are no conflicts, return value is True; if any conflicts, value is False System missing values are not labeled.""" labels = {} noConflicts = True if vardict is None: vardict = spssaux.VariableDict(targetvar) curs = spssdata.Spssdata([targetvar, labelvar]) for case in curs: val = u(case[0]) label = u(case[1]).strip() if not val is None: if val in labels and not label == labels[val]: noConflicts = False labels[val] = label curs.CClose() vardict[targetvar].ValueLabels = labels return noConflicts