"""Create SPSS variables and their metadata through Transform objects.

The Transform class and its subclasses provide a way to create variables and
associated metadata in SPSS.  They can also go beyond what is readily
possible in SPSS in that transformations can be repeatable.  This means that
a variable can remember its formula and reapply it to new or changed cases.

Transform objects support lists of expressions and conditions, but each
branch of the conditions must use the same transformation command such as
COMPUTE or RECODE.

Each subclass must implement at least __init__ and calculate methods.

This class simply throws exceptions on error conditions in most cases.

Copyright (C) 2005 by SPSS Inc.
"""

__all__ = [
    "smartquote",
    "smartsplit",
    "Transform",
    "Compute",
    "Count",
    "retransform",
    "timestamp"
]
__author__ = 'spss'
__version__ = '1.1'

# history
# 18-Oct-2005 tighter parameter checking in init
# 13-Dec-2005 support variable attributes for Transform objects

import re
import time

# spss and spssaux are not part of the standard library.  They are imported
# tolerantly so that command text can still be generated (submit=False) when
# they are missing; anything that really needs them raises ImportError at the
# point of use instead (see _submit and _fetch_attributes).
try:
    import spss
except ImportError:
    spss = None
try:
    import spssaux
except ImportError:
    spssaux = None

# Matches one quoted command inside a stored $TR.Formula value: text
# surrounded by isolated (non-doubled) double quotes, searched non-greedily.
# The value searched must not end with a quote, so callers append a blank.
_FORMULA_ITEM = re.compile(r'"([^"].+?[^"])"[^"]')


class Error(Exception):
    """Module-level exception type.

    The code in this module raises Transform.Error (the class nested in
    Transform), not this class.
    """
    pass


def _submit(cmds):
    """Run a command string or a list of commands through spss.Submit.

    Raises ImportError if the spss module could not be imported.
    """
    if spss is None:
        raise ImportError("the spss module is required to run SPSS commands")
    spss.Submit(cmds)


def _fetch_attributes(varname):
    """Return spssaux.GetAttributesDict(varname).

    Raises ImportError if the spssaux module could not be imported.
    """
    if spssaux is None:
        raise ImportError(
            "the spssaux module is required to read variable attributes")
    return spssaux.GetAttributesDict(varname)


def smartquote(s, qchar='"'):
    """Return s wrapped in qchar with every internal qchar doubled.

    Doubling distinguishes quotes inside the text from the surrounding
    quotes for SPSS.  qchar is the character to use for surrounding quotes.
    """
    return qchar + s.replace(qchar, qchar + qchar) + qchar


def smartsplit(s):
    """smartsplit(s) -> list of the non-empty items in string s.

    The string is split at every blank, single quote and double quote, and
    empty pieces are dropped.  Note that quoted text containing blanks is
    therefore NOT kept together as one item.

    TODO: make this work with embedded quotes and other tricks
    """
    return [item for item in re.split("[\"' ]", s) if item]


class Transform(object):
    """Base class for SPSS transformations.

    Normally only its subclasses would be instantiated.

    Standard parameters:
    varname - variable to create
    vartype - variable type: "Numeric" or "String" (case is ignored).
        Defaults to Numeric.
    varformat - variable format.  Defaults to F8.2, which is only valid for
        numeric variables; a string variable must be given an A format.
    varlabel - a string to be used as the variable label.
    varmeaslvl - variable measurement level in "Nominal", "Ordinal", "Scale"
        (case is ignored).  Defaults to Scale.  A Scale level is never
        written for a string variable.
    varmissval - a list of variable missing values.  Up to three values
        corresponding to SPSS rules.  Must match the type of the variable.
    varattrib - an optional dictionary of attributes and values for the
        variable.  To refer to array attributes, use the subscripted name as
        a dictionary key.
    expression - a string or list of strings containing the formula for the
        variable.  If it is a list of size n > 1, there must be a
        corresponding list of conditions of size n or n-1.  If the condition
        list is shorter, the last expression gets "ELSE".
    condition - a string or list of strings containing the conditions for
        the expressions.
    retransformable - If true, the variable formula is saved in the
        attribute "$TR.Formula" for the variable and can be used to recompute
        its values.  Default is False.

    A transform object must have at least a varname and expression
    specified.  Other metadata are created if the parameter is supplied or
    has a default, but if the variable already exists and has unspecified
    properties, they are not removed.  To replace an existing variable
    cleanly, it is best to delete it first.
    """

    class Error(Exception):
        """Raised for invalid parameters or incomplete specifications."""
        pass

    def __init__(self, **p):
        """Store the standard parameters and return the unrecognized ones.

        Subclasses call this explicitly and inspect the returned dictionary
        of leftover keyword arguments.  Because a dictionary is returned,
        instantiating Transform itself fails with TypeError.
        """
        # varname and expression always exist, so that generate() can report
        # an incomplete specification instead of failing on a missing
        # attribute (a Metadata object never has an expression).
        self.varname = None
        self.expression = None
        self.condition = []
        self.retransformable = False
        self.varattrib = {}
        self.varlabel = None
        for parm in ['varname', 'expression', 'condition', 'varlabel',
                     'retransformable', 'varattrib']:
            val = p.pop(parm, None)
            if val:
                self.__dict__[parm] = val
        Transform.setvartype(self, p.pop('vartype', None) or "Numeric")
        Transform.setvarformat(self, p.pop('varformat', None) or "F8.2")
        Transform.setvarmeaslvl(self, p.pop('varmeaslvl', None) or "Scale")
        Transform.setvarmissval(self, p.pop('varmissval', None) or [])
        return p

    def _reject_unknown(self, leftovers):
        """Raise Error if any keyword arguments were left unrecognized."""
        if leftovers:
            raise self.Error("unrecognized parameter(s): "
                             + ", ".join(sorted(leftovers)))

    def getvartype(self):
        """Return the variable type, "NUMERIC" or "STRING"."""
        return self.__vartype

    def getvarformat(self):
        """Return the variable format as it was set."""
        return self.__varformat

    def getvarmeaslvl(self):
        """Return the measurement level: "NOMINAL", "ORDINAL" or "SCALE"."""
        return self.__varmeaslvl

    def getvarmissval(self):
        """Return the missing values as a list of strings."""
        return self.__varmissval

    def setvarmissval(self, varmissval):
        """Set the list of missing values.

        varmissval must be a list of length <= 3.  If a range specification
        is used, only one is allowed, and it counts as two values.  LOWEST
        and HIGHEST can be used in ranges as values for numeric variables.
        Each value is stored as its string form.

        Raises Error if the values count for more than three; the previously
        stored list is left unchanged in that case.
        """
        values = [str(item) for item in varmissval]
        itemct = 0
        for value in values:
            itemct += 1
            if "THRU" in smartsplit(value):
                itemct += 1  # a range uses up two of the three slots
            if itemct > 3:
                raise self.Error("Too many missing values: " + str(itemct))
        self.__varmissval = values

    def setvartype(self, vartype):
        """Set the variable type; stored in upper case.

        Raises Error unless vartype is "Numeric" or "String" in any case.
        """
        vartype = vartype.upper()
        if vartype not in ("NUMERIC", "STRING"):
            raise self.Error("Invalid variable type: " + str(vartype))
        self.__vartype = vartype

    def setvarformat(self, varformat):
        """Set the variable format.  No validation is done here."""
        self.__varformat = varformat

    def setvarmeaslvl(self, varmeaslvl):
        """Set the measurement level; stored in upper case.

        Raises Error unless varmeaslvl is "Nominal", "Ordinal" or "Scale" in
        any case.
        """
        varmeaslvl = varmeaslvl.upper()
        if varmeaslvl not in ("NOMINAL", "ORDINAL", "SCALE"):
            raise self.Error("Invalid measurement level: " + str(varmeaslvl))
        self.__varmeaslvl = varmeaslvl

    vartype = property(getvartype, setvartype)
    varformat = property(getvarformat, setvarformat)
    varmeaslvl = property(getvarmeaslvl, setvarmeaslvl)
    varmissval = property(getvarmissval, setvarmissval)

    def enquote(self, item):
        """Return item as a string wrapped for the variable type.

        String variables get single quotes around the item; otherwise the
        item is surrounded by single blanks.
        """
        if self.__vartype == "STRING":
            q = "'"
        else:
            q = " "
        return q + str(item) + q

    def _cloak(self, cmds):
        """Wrap a list of commands into text suitable as an attribute value.

        Each command is quoted, the quoted commands are joined with commas,
        and the joined text is quoted again.  cmds itself is not modified.
        """
        quoted = [smartquote(cmd) for cmd in cmds]  # each item made safe
        return smartquote(",".join(quoted))  # the entire list made safe

    def _exprcode(self, exp, cond):
        """Return the list of commands for the expressions and conditions.

        exp and cond may each be a list or a single value.  If there are no
        conditions, the result is a single unconditional statement.
        Otherwise it is a sequence of DO IF, ELSE IF ... [ELSE] blocks closed
        by END IF.  If the conditions are exhausted while one expression
        remains, that (single) extra expression goes with ELSE.

        Raises Error if more than one expression is left without a condition.
        """
        if not isinstance(exp, list):
            exp = [exp]
        if not isinstance(cond, list):
            cond = [cond]
        # Empty conditions are dropped first, so that the count and the
        # indexing below refer to the same list.
        conds = [c for c in cond if c]
        cond_count = len(conds)
        syntax = []
        for index, express in enumerate(exp):
            if index < cond_count:
                if index == 0:
                    keyword = "DO IF "
                else:
                    keyword = "ELSE IF "
                syntax.append(keyword + conds[index] + ".")
            elif index > 0 and index == cond_count:
                # first expression past the last condition
                syntax.append("ELSE.")
            elif index > cond_count:
                raise self.Error("Invalid transform: too few conditions")
            syntax.append(self.calculate(express))
        if cond_count > 0:
            syntax.append("END IF.")
        return syntax

    def generate(self, submit=True):
        """Create and, if submit is True, run the SPSS transformation code.

        Returns the list of the commands generated.  For objects other than
        Metadata, the transformation commands are also kept in
        self.trfsyntax.

        Raises Error if varname is missing, if an expression is missing (or
        is present on a Metadata object), if a string variable has no A
        format, if a Metadata object is retransformable, or if varattrib
        cannot be used as a dictionary of strings.
        """
        is_metadata = isinstance(self, Metadata)
        # The setters store the type and level in upper case, so the
        # comparisons below must use upper case too.
        is_string = self.__vartype == "STRING"
        # Exactly one of "has an expression" and "is Metadata" must hold.
        if not (self.varname and bool(self.expression) ^ is_metadata):
            raise self.Error("Invalid transform:incomplete specification "
                             "or expression with Metadata")

        cmds = []
        if not is_metadata:
            if is_string:
                if not self.__varformat[:1].lower() == 'a':
                    raise self.Error(
                        "Invalid transform: string format required")
                cmds.append("STRING " + self.varname
                            + "(" + self.varformat + ").")
            self.trfsyntax = Transform._exprcode(
                self, self.expression, self.condition)
            cmds.extend(self.trfsyntax)
        elif self.retransformable:
            raise self.Error("Metadata objects cannot be retransformable")

        if self.varlabel:
            cmds.append("VARIABLE LABEL " + self.varname + " "
                        + smartquote(self.varlabel) + ".")
        if self.varmeaslvl:
            if not (is_string and self.__varmeaslvl == "SCALE"):
                cmds.append("VARIABLE LEVEL " + self.varname
                            + " (" + self.varmeaslvl + ").")
        # A string variable gets its format from the STRING command above.
        if self.varformat and not is_string:
            cmds.append("FORMAT " + self.varname
                        + " (" + self.varformat + ").")
        if self.__varmissval:
            if is_string:
                values = [smartquote(str(x)) for x in self.__varmissval]
            else:
                values = [str(x) for x in self.__varmissval]
            cmds.append("MISSING VALUES " + self.varname
                        + " (" + " ".join(values) + ").")

        if self.retransformable:
            cmds.append("VARIABLE ATTRIBUTE VARIABLES= " + self.varname
                        + " ATTRIBUTE=$TR.Formula("
                        + self._cloak(self.trfsyntax) + ").")
        elif not is_metadata:
            # Remove any formula saved by an earlier retransformable version.
            cmds.append("VARIABLE ATTRIBUTE VARIABLES= " + self.varname
                        + " DELETE=$TR.Formula.")

        if self.varattrib:
            try:
                attrlist = [key + "(" + smartquote(self.varattrib[key]) + ")"
                            for key in self.varattrib]
            except (TypeError, AttributeError, KeyError, IndexError):
                raise self.Error("Attribute list must be a dictionary")
            cmds.append("VARIABLE ATTRIBUTE VARIABLES= " + self.varname
                        + " ATTRIBUTE=\n" + "\n".join(attrlist) + ".")

        if submit:
            _submit(cmds)
        return cmds


class Compute(Transform):
    """The compute transformation.

    The expression must be the right hand side of a COMPUTE command.

    Example:
    newvar = Transform.Compute(varname="average_increase",
        varlabel="Salary increase per month of experience if at least a year",
        varmeaslvl="Scale",
        varmissval=[999,998,997],
        varformat="F8.4")
    newvar.expression = "(salary-salbegin)/jobtime"
    newvar.condition = "jobtime > 12"
    newvar.retransformable=True
    newvar.generate()  # Get exception if compute fails
    Transform.timestamp("average_increase")

    If retransformable was true, the following code will update the values.

    import spss, Transform
    try:
        Transform.retransform("average_increase")
        Transform.timestamp("average_increase")
    except Exception:
        print("Could not update average_increase.")
    else:
        spss.Submit("display dictionary"
            + "/variable=average_increase.")
    """

    def __init__(self, **args):
        """Accept the standard Transform parameters only."""
        # the parent removes the parameters it recognizes
        args = super(Compute, self).__init__(**args)
        self._reject_unknown(args)

    def calculate(self, expression):
        """Return the COMPUTE command assigning expression to varname."""
        return "COMPUTE " + self.varname + "=" + expression + "."


class Recode(Transform):
    """The RECODE transformation command.

    The expression must be the RECODE specification of a RECODE command
    omitting the input and output variable names.  The RECODE uses the INTO
    form, where the input and output variables are different.  Only one
    variable at a time can be recoded.

    The expression consists of one or more parenthesized items of the form
    (oldvaluelist=newvalue).  Special keywords ELSE, CONVERT**, LOWEST*,
    HIGHEST*, THRU*, MISSING*, and SYSMIS* can be used in the input list and
    COPY can be used in the output list.
    * = numeric variables only.  ** = string variables only.
    String values must be quoted according to SPSS conventions.

    inputvar = varname identifies the source variable or variables.
    """

    def __init__(self, **args):
        """Accept the standard Transform parameters plus inputvar."""
        args = super(Recode, self).__init__(**args)
        self.inputvar = args.pop("inputvar", None)
        self._reject_unknown(args)

    def calculate(self, expression):
        """Return the RECODE ... INTO command for expression.

        Raises Error if inputvar has not been set.
        """
        if not self.__dict__.get("inputvar"):
            raise self.Error("Invalid recode: missing inputvar name")
        return ("RECODE " + self.inputvar + " " + expression
                + " INTO " + self.varname + ".")


class Count(Transform):
    """The COUNT transformation command.

    COUNT creates a casewise variable that counts the occurrence of the same
    value or list of values across a list of variables.  The expression must
    be a variable list followed by a value list in parentheses.  For example,
    x y z (1 2).

    String variable values must be quoted according to the usual SPSS
    conventions.  Numeric variables can use the special keywords LOWEST,
    HIGHEST, THRU, MISSING, and SYSMIS.

    Example:
    tcount = Transform.Count(varname="newvar",
        expression="hlth1 hlth2 hlth3(1)")
    tcount.generate()
    """

    def __init__(self, **args):
        """Accept the standard Transform parameters only."""
        args = super(Count, self).__init__(**args)
        self._reject_unknown(args)

    def calculate(self, expression):
        """Return the COUNT command assigning expression to varname."""
        return "COUNT " + self.varname + "=" + expression + "."


class Metadata(Transform):
    r"""This class allows the creation of any of the metadata supported by
    Transform (other than the retransformable attribute) for an existing
    variable.

    No expression or condition list is used, and the type cannot be
    specified.

    NOTE(review): the base class defaults still apply, so generate() writes
    VARIABLE LEVEL and FORMAT commands with the defaults (SCALE, F8.2) when
    varmeaslvl and varformat are not supplied.
    """

    def __init__(self, **args):
        """Accept the standard Transform parameters only."""
        args = super(Metadata, self).__init__(**args)
        self._reject_unknown(args)


def retransformable(varname):
    """Return a two-tuple for varname.

    The first item is a Boolean indicating whether variable varname has a
    formula attribute ("$TR.Formula") and can, therefore, be recalculated
    (assuming that all the necessary inputs are available).  The second item
    is an attributes dictionary for varname suitable for passing to
    retransform (which will otherwise refetch it).  The dictionary may be
    empty.
    """
    attrdict = _fetch_attributes(varname)
    return ("$TR.Formula" in attrdict, attrdict)


# standalone retransform
def retransform(varname, submit=True, varattrib=None):
    """Repeat a transformation on an existing retransformable variable.

    Only the formula is reapplied.  If submit is True, the computational
    commands are run.  The list of commands is returned.

    If varattrib is supplied (and not empty), it is used as the attribute
    dictionary; otherwise it is fetched from SPSS.

    Raises Transform.Error if no command can be recovered from the
    "$TR.Formula" attribute, i.e. the transformation is not repeatable or is
    malformed.
    """
    if not varattrib:
        varattrib = _fetch_attributes(varname)
    # The pattern needs one character after each closing quote, so the value
    # must not end with a quote (kluge): a blank is appended.
    formula = varattrib.get("$TR.Formula", "") + ' '
    # Take each quoted command in turn and undo the quote doubling in it.
    cmds = [mobj.group(1).replace('""', '"')
            for mobj in _FORMULA_ITEM.finditer(formula)]
    if not cmds:
        raise Transform.Error("bad or nonrepeatable transformation")
    if submit:
        _submit(cmds)
    return cmds


def timestamp(varname):
    """Set the transformation timestamp attribute.

    The current local time, formatted as YYYY-MM-DD HH:MM:SS, is stored in
    the attribute $TR.timestamp of varname.

    This is intended to allow a user to determine when a retransformable
    variable was last updated, but it is up to the caller to determine when
    to apply this stamp.  It could be used for other purposes as well.
    """
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    _submit("VARIABLE ATTRIBUTE VARIABLES=" + varname
            + " ATTRIBUTE=$TR.timestamp('" + stamp + "').")