xlsx.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802
  1. ##
  2. # Portions copyright (c) 2008-2012 Stephen John Machin, Lingfo Pty Ltd
  3. # This module is part of the xlrd package, which is released under a BSD-style licence.
  4. ##
  5. from __future__ import print_function, unicode_literals
  6. DEBUG = 0
  7. import sys
  8. import re
  9. from .timemachine import *
  10. from .book import Book, Name
  11. from .biffh import error_text_from_code, XLRDError, XL_CELL_BLANK, XL_CELL_TEXT, XL_CELL_BOOLEAN, XL_CELL_ERROR
  12. from .formatting import is_date_format_string, Format, XF
  13. from .sheet import Sheet
  14. DLF = sys.stdout # Default Log File
  15. ET = None
  16. ET_has_iterparse = False
  17. def ensure_elementtree_imported(verbosity, logfile):
  18. global ET, ET_has_iterparse
  19. if ET is not None:
  20. return
  21. if "IronPython" in sys.version:
  22. import xml.etree.ElementTree as ET
  23. #### 2.7.2.1: fails later with
  24. #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
  25. else:
  26. try: import xml.etree.cElementTree as ET
  27. except ImportError:
  28. try: import cElementTree as ET
  29. except ImportError:
  30. try: import lxml.etree as ET
  31. except ImportError:
  32. try: import xml.etree.ElementTree as ET
  33. except ImportError:
  34. try: import elementtree.ElementTree as ET
  35. except ImportError:
  36. raise Exception("Failed to import an ElementTree implementation")
  37. if hasattr(ET, 'iterparse'):
  38. _dummy_stream = BYTES_IO(b'')
  39. try:
  40. ET.iterparse(_dummy_stream)
  41. ET_has_iterparse = True
  42. except NotImplementedError:
  43. pass
  44. if verbosity:
  45. etree_version = repr([
  46. (item, getattr(ET, item))
  47. for item in ET.__dict__.keys()
  48. if item.lower().replace('_', '') == 'version'
  49. ])
  50. print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile)
  51. def split_tag(tag):
  52. pos = tag.rfind('}') + 1
  53. if pos >= 2:
  54. return tag[:pos], tag[pos:]
  55. return '', tag
  56. def augment_keys(adict, uri):
  57. # uri must already be enclosed in {}
  58. for x in list(adict.keys()):
  59. adict[uri + x] = adict[x]
  60. _UPPERCASE_1_REL_INDEX = {} # Used in fast conversion of column names (e.g. "XFD") to indices (16383)
  61. for _x in xrange(26):
  62. _UPPERCASE_1_REL_INDEX["ABCDEFGHIJKLMNOPQRSTUVWXYZ"[_x]] = _x + 1
  63. for _x in "123456789":
  64. _UPPERCASE_1_REL_INDEX[_x] = 0
  65. del _x
  66. def cell_name_to_rowx_colx(cell_name, letter_value=_UPPERCASE_1_REL_INDEX):
  67. # Extract column index from cell name
  68. # A<row number> => 0, Z =>25, AA => 26, XFD => 16383
  69. colx = 0
  70. charx = -1
  71. try:
  72. for c in cell_name:
  73. charx += 1
  74. lv = letter_value[c]
  75. if lv:
  76. colx = colx * 26 + lv
  77. else: # start of row number; can't be '0'
  78. colx = colx - 1
  79. assert 0 <= colx < X12_MAX_COLS
  80. break
  81. except KeyError:
  82. raise Exception('Unexpected character %r in cell name %r' % (c, cell_name))
  83. rowx = int(cell_name[charx:]) - 1
  84. return rowx, colx
  85. error_code_from_text = {}
  86. for _code, _text in error_text_from_code.items():
  87. error_code_from_text[_text] = _code
  88. # === X12 === Excel 2007 .xlsx ===============================================
  89. U_SSML12 = "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}"
  90. U_ODREL = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}"
  91. U_PKGREL = "{http://schemas.openxmlformats.org/package/2006/relationships}"
  92. U_CP = "{http://schemas.openxmlformats.org/package/2006/metadata/core-properties}"
  93. U_DC = "{http://purl.org/dc/elements/1.1/}"
  94. U_DCTERMS = "{http://purl.org/dc/terms/}"
  95. XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"
  96. XML_WHITESPACE = "\t\n \r"
  97. X12_MAX_ROWS = 2 ** 20
  98. X12_MAX_COLS = 2 ** 14
  99. V_TAG = U_SSML12 + 'v' # cell child: value
  100. F_TAG = U_SSML12 + 'f' # cell child: formula
  101. IS_TAG = U_SSML12 + 'is' # cell child: inline string
  102. def unescape(s,
  103. subber=re.compile(r'_x[0-9A-Fa-f]{4,4}_', re.UNICODE).sub,
  104. repl=lambda mobj: unichr(int(mobj.group(0)[2:6], 16)),
  105. ):
  106. if "_" in s:
  107. return subber(repl, s)
  108. return s
  109. def cooked_text(self, elem):
  110. t = elem.text
  111. if t is None:
  112. return ''
  113. if elem.get(XML_SPACE_ATTR) != 'preserve':
  114. t = t.strip(XML_WHITESPACE)
  115. return ensure_unicode(unescape(t))
  116. def get_text_from_si_or_is(self, elem, r_tag=U_SSML12+'r', t_tag=U_SSML12 +'t'):
  117. "Returns unescaped unicode"
  118. accum = []
  119. for child in elem:
  120. # self.dump_elem(child)
  121. tag = child.tag
  122. if tag == t_tag:
  123. t = cooked_text(self, child)
  124. if t: # note: .text attribute can be None
  125. accum.append(t)
  126. elif tag == r_tag:
  127. for tnode in child:
  128. if tnode.tag == t_tag:
  129. t = cooked_text(self, tnode)
  130. if t:
  131. accum.append(t)
  132. return ''.join(accum)
  133. def map_attributes(amap, elem, obj):
  134. for xml_attr, obj_attr, cnv_func_or_const in amap:
  135. if not xml_attr:
  136. setattr(obj, obj_attr, cnv_func_or_const)
  137. continue
  138. if not obj_attr: continue #### FIX ME ####
  139. raw_value = elem.get(xml_attr)
  140. cooked_value = cnv_func_or_const(raw_value)
  141. setattr(obj, obj_attr, cooked_value)
  142. def cnv_ST_Xstring(s):
  143. if s is None: return ""
  144. return ensure_unicode(s)
  145. def cnv_xsd_unsignedInt(s):
  146. if not s:
  147. return None
  148. value = int(s)
  149. assert value >= 0
  150. return value
  151. def cnv_xsd_boolean(s):
  152. if not s:
  153. return 0
  154. if s in ("1", "true", "on"):
  155. return 1
  156. if s in ("0", "false", "off"):
  157. return 0
  158. raise ValueError("unexpected xsd:boolean value: %r" % s)
  159. _defined_name_attribute_map = (
  160. ("name", "name", cnv_ST_Xstring, ),
  161. ("comment", "", cnv_ST_Xstring, ),
  162. ("customMenu", "", cnv_ST_Xstring, ),
  163. ("description", "", cnv_ST_Xstring, ),
  164. ("help", "", cnv_ST_Xstring, ),
  165. ("statusBar", "", cnv_ST_Xstring, ),
  166. ("localSheetId", "scope", cnv_xsd_unsignedInt, ),
  167. ("hidden", "hidden", cnv_xsd_boolean, ),
  168. ("function", "func", cnv_xsd_boolean, ),
  169. ("vbProcedure", "vbasic", cnv_xsd_boolean, ),
  170. ("xlm", "macro", cnv_xsd_boolean, ),
  171. ("functionGroupId", "funcgroup", cnv_xsd_unsignedInt, ),
  172. ("shortcutKey", "", cnv_ST_Xstring, ),
  173. ("publishToServer", "", cnv_xsd_boolean, ),
  174. ("workbookParameter", "", cnv_xsd_boolean, ),
  175. ("", "any_err", 0, ),
  176. ("", "any_external", 0, ),
  177. ("", "any_rel", 0, ),
  178. ("", "basic_formula_len", 0, ),
  179. ("", "binary", 0, ),
  180. ("", "builtin", 0, ),
  181. ("", "complex", 0, ),
  182. ("", "evaluated", 0, ),
  183. ("", "excel_sheet_index", 0, ),
  184. ("", "excel_sheet_num", 0, ),
  185. ("", "option_flags", 0, ),
  186. ("", "result", None, ),
  187. ("", "stack", None, ),
  188. )
  189. def make_name_access_maps(bk):
  190. name_and_scope_map = {} # (name.lower(), scope): Name_object
  191. name_map = {} # name.lower() : list of Name_objects (sorted in scope order)
  192. num_names = len(bk.name_obj_list)
  193. for namex in xrange(num_names):
  194. nobj = bk.name_obj_list[namex]
  195. name_lcase = nobj.name.lower()
  196. key = (name_lcase, nobj.scope)
  197. if key in name_and_scope_map:
  198. msg = 'Duplicate entry %r in name_and_scope_map' % (key, )
  199. if 0:
  200. raise XLRDError(msg)
  201. else:
  202. if bk.verbosity:
  203. print(msg, file=bk.logfile)
  204. name_and_scope_map[key] = nobj
  205. if name_lcase in name_map:
  206. name_map[name_lcase].append((nobj.scope, nobj))
  207. else:
  208. name_map[name_lcase] = [(nobj.scope, nobj)]
  209. for key in name_map.keys():
  210. alist = name_map[key]
  211. alist.sort()
  212. name_map[key] = [x[1] for x in alist]
  213. bk.name_and_scope_map = name_and_scope_map
  214. bk.name_map = name_map
  215. class X12General(object):
  216. def process_stream(self, stream, heading=None):
  217. if self.verbosity >= 2 and heading is not None:
  218. fprintf(self.logfile, "\n=== %s ===\n", heading)
  219. self.tree = ET.parse(stream)
  220. getmethod = self.tag2meth.get
  221. for elem in self.tree.getiterator():
  222. if self.verbosity >= 3:
  223. self.dump_elem(elem)
  224. meth = getmethod(elem.tag)
  225. if meth:
  226. meth(self, elem)
  227. self.finish_off()
  228. def finish_off(self):
  229. pass
  230. def dump_elem(self, elem):
  231. fprintf(self.logfile,
  232. "===\ntag=%r len=%d attrib=%r text=%r tail=%r\n",
  233. split_tag(elem.tag)[1], len(elem), elem.attrib, elem.text, elem.tail)
  234. def dumpout(self, fmt, *vargs):
  235. text = (12 * ' ' + fmt + '\n') % vargs
  236. self.logfile.write(text)
  237. class X12Book(X12General):
  238. def __init__(self, bk, logfile=DLF, verbosity=False):
  239. self.bk = bk
  240. self.logfile = logfile
  241. self.verbosity = verbosity
  242. self.bk.nsheets = 0
  243. self.bk.props = {}
  244. self.relid2path = {}
  245. self.relid2reltype = {}
  246. self.sheet_targets = [] # indexed by sheetx
  247. self.sheetIds = [] # indexed by sheetx
  248. core_props_menu = {
  249. U_CP+"lastModifiedBy": ("last_modified_by", cnv_ST_Xstring),
  250. U_DC+"creator": ("creator", cnv_ST_Xstring),
  251. U_DCTERMS+"modified": ("modified", cnv_ST_Xstring),
  252. U_DCTERMS+"created": ("created", cnv_ST_Xstring),
  253. }
  254. def process_coreprops(self, stream):
  255. if self.verbosity >= 2:
  256. fprintf(self.logfile, "\n=== coreProps ===\n")
  257. self.tree = ET.parse(stream)
  258. getmenu = self.core_props_menu.get
  259. props = {}
  260. for elem in self.tree.getiterator():
  261. if self.verbosity >= 3:
  262. self.dump_elem(elem)
  263. menu = getmenu(elem.tag)
  264. if menu:
  265. attr, func = menu
  266. value = func(elem.text)
  267. props[attr] = value
  268. self.bk.user_name = props.get('last_modified_by') or props.get('creator')
  269. self.bk.props = props
  270. if self.verbosity >= 2:
  271. fprintf(self.logfile, "props: %r\n", props)
  272. self.finish_off()
  273. def process_rels(self, stream):
  274. if self.verbosity >= 2:
  275. fprintf(self.logfile, "\n=== Relationships ===\n")
  276. tree = ET.parse(stream)
  277. r_tag = U_PKGREL + 'Relationship'
  278. for elem in tree.findall(r_tag):
  279. rid = elem.get('Id')
  280. target = elem.get('Target')
  281. reltype = elem.get('Type').split('/')[-1]
  282. if self.verbosity >= 2:
  283. self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target)
  284. self.relid2reltype[rid] = reltype
  285. # self.relid2path[rid] = 'xl/' + target
  286. if target.startswith('/'):
  287. self.relid2path[rid] = target[1:] # drop the /
  288. else:
  289. self.relid2path[rid] = 'xl/' + target
  290. def do_defined_name(self, elem):
  291. #### UNDER CONSTRUCTION ####
  292. if 0 and self.verbosity >= 3:
  293. self.dump_elem(elem)
  294. nobj = Name()
  295. bk = self.bk
  296. nobj.bk = bk
  297. nobj.name_index = len(bk.name_obj_list)
  298. bk.name_obj_list.append(nobj)
  299. nobj.name = elem.get('name')
  300. nobj.raw_formula = None # compiled bytecode formula -- not in XLSX
  301. nobj.formula_text = cooked_text(self, elem)
  302. map_attributes(_defined_name_attribute_map, elem, nobj)
  303. if nobj.scope is None:
  304. nobj.scope = -1 # global
  305. if nobj.name.startswith("_xlnm."):
  306. nobj.builtin = 1
  307. if self.verbosity >= 2:
  308. nobj.dump(header='=== Name object ===')
  309. def do_defined_names(self, elem):
  310. for child in elem:
  311. self.do_defined_name(child)
  312. make_name_access_maps(self.bk)
  313. def do_sheet(self, elem):
  314. bk = self.bk
  315. sheetx = bk.nsheets
  316. # print elem.attrib
  317. rid = elem.get(U_ODREL + 'id')
  318. sheetId = int(elem.get('sheetId'))
  319. name = unescape(ensure_unicode(elem.get('name')))
  320. reltype = self.relid2reltype[rid]
  321. target = self.relid2path[rid]
  322. if self.verbosity >= 2:
  323. self.dumpout(
  324. 'sheetx=%d sheetId=%r rid=%r type=%r name=%r',
  325. sheetx, sheetId, rid, reltype, name)
  326. if reltype != 'worksheet':
  327. if self.verbosity >= 2:
  328. self.dumpout('Ignoring sheet of type %r (name=%r)', reltype, name)
  329. return
  330. state = elem.get('state')
  331. visibility_map = {
  332. None: 0,
  333. 'visible': 0,
  334. 'hidden': 1,
  335. 'veryHidden': 2
  336. }
  337. bk._sheet_visibility.append(visibility_map[state])
  338. sheet = Sheet(bk, position=None, name=name, number=sheetx)
  339. sheet.utter_max_rows = X12_MAX_ROWS
  340. sheet.utter_max_cols = X12_MAX_COLS
  341. bk._sheet_list.append(sheet)
  342. bk._sheet_names.append(name)
  343. bk.nsheets += 1
  344. self.sheet_targets.append(target)
  345. self.sheetIds.append(sheetId)
  346. def do_workbookpr(self, elem):
  347. datemode = cnv_xsd_boolean(elem.get('date1904'))
  348. if self.verbosity >= 2:
  349. self.dumpout('datemode=%r', datemode)
  350. self.bk.datemode = datemode
  351. tag2meth = {
  352. 'definedNames': do_defined_names,
  353. 'workbookPr': do_workbookpr,
  354. 'sheet': do_sheet,
  355. }
  356. augment_keys(tag2meth, U_SSML12)
  357. class X12SST(X12General):
  358. def __init__(self, bk, logfile=DLF, verbosity=0):
  359. self.bk = bk
  360. self.logfile = logfile
  361. self.verbosity = verbosity
  362. if ET_has_iterparse:
  363. self.process_stream = self.process_stream_iterparse
  364. else:
  365. self.process_stream = self.process_stream_findall
  366. def process_stream_iterparse(self, stream, heading=None):
  367. if self.verbosity >= 2 and heading is not None:
  368. fprintf(self.logfile, "\n=== %s ===\n", heading)
  369. si_tag = U_SSML12 + 'si'
  370. elemno = -1
  371. sst = self.bk._sharedstrings
  372. for event, elem in ET.iterparse(stream):
  373. if elem.tag != si_tag: continue
  374. elemno = elemno + 1
  375. if self.verbosity >= 3:
  376. fprintf(self.logfile, "element #%d\n", elemno)
  377. self.dump_elem(elem)
  378. result = get_text_from_si_or_is(self, elem)
  379. sst.append(result)
  380. elem.clear() # destroy all child elements
  381. if self.verbosity >= 2:
  382. self.dumpout('Entries in SST: %d', len(sst))
  383. if self.verbosity >= 3:
  384. for x, s in enumerate(sst):
  385. fprintf(self.logfile, "SST x=%d s=%r\n", x, s)
  386. def process_stream_findall(self, stream, heading=None):
  387. if self.verbosity >= 2 and heading is not None:
  388. fprintf(self.logfile, "\n=== %s ===\n", heading)
  389. self.tree = ET.parse(stream)
  390. si_tag = U_SSML12 + 'si'
  391. elemno = -1
  392. sst = self.bk._sharedstrings
  393. for elem in self.tree.findall(si_tag):
  394. elemno = elemno + 1
  395. if self.verbosity >= 3:
  396. fprintf(self.logfile, "element #%d\n", elemno)
  397. self.dump_elem(elem)
  398. result = get_text_from_si_or_is(self, elem)
  399. sst.append(result)
  400. if self.verbosity >= 2:
  401. self.dumpout('Entries in SST: %d', len(sst))
  402. class X12Styles(X12General):
  403. def __init__(self, bk, logfile=DLF, verbosity=0):
  404. self.bk = bk
  405. self.logfile = logfile
  406. self.verbosity = verbosity
  407. self.xf_counts = [0, 0]
  408. self.xf_type = None
  409. self.fmt_is_date = {}
  410. for x in list(range(14, 23)) + list(range(45, 48)): #### hard-coding FIX ME ####
  411. self.fmt_is_date[x] = 1
  412. # dummy entry for XF 0 in case no Styles section
  413. self.bk._xf_index_to_xl_type_map[0] = 2
  414. # fill_in_standard_formats(bk) #### pre-integration kludge
  415. def do_cellstylexfs(self, elem):
  416. self.xf_type = 0
  417. def do_cellxfs(self, elem):
  418. self.xf_type = 1
  419. def do_numfmt(self, elem):
  420. formatCode = ensure_unicode(elem.get('formatCode'))
  421. numFmtId = int(elem.get('numFmtId'))
  422. is_date = is_date_format_string(self.bk, formatCode)
  423. self.fmt_is_date[numFmtId] = is_date
  424. fmt_obj = Format(numFmtId, is_date + 2, formatCode)
  425. self.bk.format_map[numFmtId] = fmt_obj
  426. if self.verbosity >= 3:
  427. self.dumpout('numFmtId=%d formatCode=%r is_date=%d', numFmtId, formatCode, is_date)
  428. def do_xf(self, elem):
  429. if self.xf_type != 1:
  430. #### ignoring style XFs for the moment
  431. return
  432. xfx = self.xf_counts[self.xf_type]
  433. self.xf_counts[self.xf_type] = xfx + 1
  434. xf = XF()
  435. self.bk.xf_list.append(xf)
  436. self.bk.xfcount += 1
  437. numFmtId = int(elem.get('numFmtId', '0'))
  438. xf.format_key = numFmtId
  439. is_date = self.fmt_is_date.get(numFmtId, 0)
  440. self.bk._xf_index_to_xl_type_map[xfx] = is_date + 2
  441. if self.verbosity >= 3:
  442. self.dumpout(
  443. 'xfx=%d numFmtId=%d',
  444. xfx, numFmtId,
  445. )
  446. self.dumpout(repr(self.bk._xf_index_to_xl_type_map))
  447. tag2meth = {
  448. 'cellStyleXfs': do_cellstylexfs,
  449. 'cellXfs': do_cellxfs,
  450. 'numFmt': do_numfmt,
  451. 'xf': do_xf,
  452. }
  453. augment_keys(tag2meth, U_SSML12)
  454. class X12Sheet(X12General):
  455. def __init__(self, sheet, logfile=DLF, verbosity=0):
  456. self.sheet = sheet
  457. self.logfile = logfile
  458. self.verbosity = verbosity
  459. self.rowx = -1 # We may need to count them.
  460. self.bk = sheet.book
  461. self.sst = self.bk._sharedstrings
  462. self.merged_cells = sheet.merged_cells
  463. self.warned_no_cell_name = 0
  464. self.warned_no_row_num = 0
  465. if ET_has_iterparse:
  466. self.process_stream = self.own_process_stream
  467. def own_process_stream(self, stream, heading=None):
  468. if self.verbosity >= 2 and heading is not None:
  469. fprintf(self.logfile, "\n=== %s ===\n", heading)
  470. getmethod = self.tag2meth.get
  471. row_tag = U_SSML12 + "row"
  472. self_do_row = self.do_row
  473. for event, elem in ET.iterparse(stream):
  474. if elem.tag == row_tag:
  475. self_do_row(elem)
  476. elem.clear() # destroy all child elements (cells)
  477. elif elem.tag == U_SSML12 + "dimension":
  478. self.do_dimension(elem)
  479. elif elem.tag == U_SSML12 + "mergeCell":
  480. self.do_merge_cell(elem)
  481. self.finish_off()
  482. def process_comments_stream(self, stream):
  483. root = ET.parse(stream).getroot()
  484. author_list = root[0]
  485. assert author_list.tag == U_SSML12 + 'authors'
  486. authors = [elem.text for elem in author_list]
  487. comment_list = root[1]
  488. assert comment_list.tag == U_SSML12 + 'commentList'
  489. cell_note_map = self.sheet.cell_note_map
  490. from .sheet import Note
  491. text_tag = U_SSML12 + 'text'
  492. r_tag = U_SSML12 + 'r'
  493. t_tag = U_SSML12 + 't'
  494. for elem in comment_list.findall(U_SSML12 + 'comment'):
  495. ts = elem.findall('./' + text_tag + '/' + t_tag)
  496. ts += elem.findall('./' + text_tag + '/' + r_tag + '/' + t_tag)
  497. ref = elem.get('ref')
  498. note = Note()
  499. note.author = authors[int(elem.get('authorId'))]
  500. note.rowx, note.colx = coords = cell_name_to_rowx_colx(ref)
  501. note.text = ''
  502. for t in ts:
  503. note.text += cooked_text(self, t)
  504. cell_note_map[coords] = note
  505. def do_dimension(self, elem):
  506. ref = elem.get('ref') # example: "A1:Z99" or just "A1"
  507. if ref:
  508. # print >> self.logfile, "dimension: ref=%r" % ref
  509. last_cell_ref = ref.split(':')[-1] # example: "Z99"
  510. rowx, colx = cell_name_to_rowx_colx(last_cell_ref)
  511. self.sheet._dimnrows = rowx + 1
  512. self.sheet._dimncols = colx + 1
  513. def do_merge_cell(self, elem):
  514. # The ref attribute should be a cell range like "B1:D5".
  515. ref = elem.get('ref')
  516. if ref:
  517. first_cell_ref, last_cell_ref = ref.split(':')
  518. first_rowx, first_colx = cell_name_to_rowx_colx(first_cell_ref)
  519. last_rowx, last_colx = cell_name_to_rowx_colx(last_cell_ref)
  520. self.merged_cells.append((first_rowx, last_rowx + 1,
  521. first_colx, last_colx + 1))
  522. def do_row(self, row_elem):
  523. def bad_child_tag(child_tag):
  524. raise Exception('cell type %s has unexpected child <%s> at rowx=%r colx=%r' % (cell_type, child_tag, rowx, colx))
  525. row_number = row_elem.get('r')
  526. if row_number is None: # Yes, it's optional.
  527. self.rowx += 1
  528. explicit_row_number = 0
  529. if self.verbosity and not self.warned_no_row_num:
  530. self.dumpout("no row number; assuming rowx=%d", self.rowx)
  531. self.warned_no_row_num = 1
  532. else:
  533. self.rowx = int(row_number) - 1
  534. explicit_row_number = 1
  535. assert 0 <= self.rowx < X12_MAX_ROWS
  536. rowx = self.rowx
  537. colx = -1
  538. if self.verbosity >= 3:
  539. self.dumpout("<row> row_number=%r rowx=%d explicit=%d",
  540. row_number, self.rowx, explicit_row_number)
  541. letter_value = _UPPERCASE_1_REL_INDEX
  542. for cell_elem in row_elem:
  543. cell_name = cell_elem.get('r')
  544. if cell_name is None: # Yes, it's optional.
  545. colx += 1
  546. if self.verbosity and not self.warned_no_cell_name:
  547. self.dumpout("no cellname; assuming rowx=%d colx=%d", rowx, colx)
  548. self.warned_no_cell_name = 1
  549. else:
  550. # Extract column index from cell name
  551. # A<row number> => 0, Z =>25, AA => 26, XFD => 16383
  552. colx = 0
  553. charx = -1
  554. try:
  555. for c in cell_name:
  556. charx += 1
  557. if c == '$':
  558. continue
  559. lv = letter_value[c]
  560. if lv:
  561. colx = colx * 26 + lv
  562. else: # start of row number; can't be '0'
  563. colx = colx - 1
  564. assert 0 <= colx < X12_MAX_COLS
  565. break
  566. except KeyError:
  567. raise Exception('Unexpected character %r in cell name %r' % (c, cell_name))
  568. if explicit_row_number and cell_name[charx:] != row_number:
  569. raise Exception('cell name %r but row number is %r' % (cell_name, row_number))
  570. xf_index = int(cell_elem.get('s', '0'))
  571. cell_type = cell_elem.get('t', 'n')
  572. tvalue = None
  573. formula = None
  574. if cell_type == 'n':
  575. # n = number. Most frequent type.
  576. # <v> child contains plain text which can go straight into float()
  577. # OR there's no text in which case it's a BLANK cell
  578. for child in cell_elem:
  579. child_tag = child.tag
  580. if child_tag == V_TAG:
  581. tvalue = child.text
  582. elif child_tag == F_TAG:
  583. formula = cooked_text(self, child)
  584. else:
  585. raise Exception('unexpected tag %r' % child_tag)
  586. if not tvalue:
  587. if self.bk.formatting_info:
  588. self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index)
  589. else:
  590. self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index)
  591. elif cell_type == "s":
  592. # s = index into shared string table. 2nd most frequent type
  593. # <v> child contains plain text which can go straight into int()
  594. for child in cell_elem:
  595. child_tag = child.tag
  596. if child_tag == V_TAG:
  597. tvalue = child.text
  598. elif child_tag == F_TAG:
  599. # formula not expected here, but gnumeric does it.
  600. formula = child.text
  601. else:
  602. bad_child_tag(child_tag)
  603. if not tvalue:
  604. # <c r="A1" t="s"/>
  605. if self.bk.formatting_info:
  606. self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index)
  607. else:
  608. value = self.sst[int(tvalue)]
  609. self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, value, xf_index)
  610. elif cell_type == "str":
  611. # str = string result from formula.
  612. # Should have <f> (formula) child; however in one file, all text cells are str with no formula.
  613. # <v> child can contain escapes
  614. for child in cell_elem:
  615. child_tag = child.tag
  616. if child_tag == V_TAG:
  617. tvalue = cooked_text(self, child)
  618. elif child_tag == F_TAG:
  619. formula = cooked_text(self, child)
  620. else:
  621. bad_child_tag(child_tag)
  622. # assert tvalue is not None and formula is not None
  623. # Yuk. Fails with file created by gnumeric -- no tvalue!
  624. self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index)
  625. elif cell_type == "b":
  626. # b = boolean
  627. # <v> child contains "0" or "1"
  628. # Maybe the data should be converted with cnv_xsd_boolean;
  629. # ECMA standard is silent; Excel 2007 writes 0 or 1
  630. for child in cell_elem:
  631. child_tag = child.tag
  632. if child_tag == V_TAG:
  633. tvalue = child.text
  634. elif child_tag == F_TAG:
  635. formula = cooked_text(self, child)
  636. else:
  637. bad_child_tag(child_tag)
  638. self.sheet.put_cell(rowx, colx, XL_CELL_BOOLEAN, int(tvalue), xf_index)
  639. elif cell_type == "e":
  640. # e = error
  641. # <v> child contains e.g. "#REF!"
  642. for child in cell_elem:
  643. child_tag = child.tag
  644. if child_tag == V_TAG:
  645. tvalue = child.text
  646. elif child_tag == F_TAG:
  647. formula = cooked_text(self, child)
  648. else:
  649. bad_child_tag(child_tag)
  650. value = error_code_from_text[tvalue]
  651. self.sheet.put_cell(rowx, colx, XL_CELL_ERROR, value, xf_index)
  652. elif cell_type == "inlineStr":
  653. # Not expected in files produced by Excel.
  654. # Only possible child is <is>.
  655. # It's a way of allowing 3rd party s/w to write text (including rich text) cells
  656. # without having to build a shared string table
  657. for child in cell_elem:
  658. child_tag = child.tag
  659. if child_tag == IS_TAG:
  660. tvalue = get_text_from_si_or_is(self, child)
  661. else:
  662. bad_child_tag(child_tag)
  663. assert tvalue is not None
  664. self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index)
  665. else:
  666. raise Exception("Unknown cell type %r in rowx=%d colx=%d" % (cell_type, rowx, colx))
  667. tag2meth = {
  668. 'row': do_row,
  669. }
  670. augment_keys(tag2meth, U_SSML12)
  671. def open_workbook_2007_xml(
  672. zf,
  673. component_names,
  674. logfile=sys.stdout,
  675. verbosity=0,
  676. use_mmap=0,
  677. formatting_info=0,
  678. on_demand=0,
  679. ragged_rows=0,
  680. ):
  681. ensure_elementtree_imported(verbosity, logfile)
  682. bk = Book()
  683. bk.logfile = logfile
  684. bk.verbosity = verbosity
  685. bk.formatting_info = formatting_info
  686. if formatting_info:
  687. raise NotImplementedError("formatting_info=True not yet implemented")
  688. bk.use_mmap = False #### Not supported initially
  689. bk.on_demand = on_demand
  690. if on_demand:
  691. if verbosity:
  692. print("WARNING *** on_demand=True not yet implemented; falling back to False", file=bk.logfile)
  693. bk.on_demand = False
  694. bk.ragged_rows = ragged_rows
  695. x12book = X12Book(bk, logfile, verbosity)
  696. zflo = zf.open('xl/_rels/workbook.xml.rels')
  697. x12book.process_rels(zflo)
  698. del zflo
  699. zflo = zf.open('xl/workbook.xml')
  700. x12book.process_stream(zflo, 'Workbook')
  701. del zflo
  702. props_name = 'docProps/core.xml'
  703. if props_name in component_names:
  704. zflo = zf.open(props_name)
  705. x12book.process_coreprops(zflo)
  706. x12sty = X12Styles(bk, logfile, verbosity)
  707. if 'xl/styles.xml' in component_names:
  708. zflo = zf.open('xl/styles.xml')
  709. x12sty.process_stream(zflo, 'styles')
  710. del zflo
  711. else:
  712. # seen in MS sample file MergedCells.xlsx
  713. pass
  714. sst_fname = 'xl/sharedStrings.xml'
  715. x12sst = X12SST(bk, logfile, verbosity)
  716. if sst_fname in component_names:
  717. zflo = zf.open(sst_fname)
  718. x12sst.process_stream(zflo, 'SST')
  719. del zflo
  720. for sheetx in range(bk.nsheets):
  721. fname = x12book.sheet_targets[sheetx]
  722. zflo = zf.open(fname)
  723. sheet = bk._sheet_list[sheetx]
  724. x12sheet = X12Sheet(sheet, logfile, verbosity)
  725. heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname)
  726. x12sheet.process_stream(zflo, heading)
  727. del zflo
  728. comments_fname = 'xl/comments%d.xml' % (sheetx + 1)
  729. if comments_fname in component_names:
  730. comments_stream = zf.open(comments_fname)
  731. x12sheet.process_comments_stream(comments_stream)
  732. del comments_stream
  733. sheet.tidy_dimensions()
  734. return bk