Skip to content

Multi-level outline numbering incorrectly rendered as single-level in DOCX #2758

@atok666

Description

@atok666

Bug

When parsing DOCX documents that use Word's multi-level outline numbering (e.g., "3.1", "3.2"), Docling renders the numbering as single-level only (e.g., "1", "2", "3"), losing the hierarchical structure.

Expected: "3.1 Introduction", "3.2 Solution Overview", "3.3 Requirements"
Actual: "1 Introduction", "2 Solution Overview", "3 Requirements"

Steps to reproduce

from docx import Document
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from pathlib import Path

# Create document with multi-level outline numbering
doc = Document()
numbering_part = doc.part.numbering_part
numbering_xml = numbering_part._element


def _make_level(ilvl, start, num_fmt, lvl_text):
    """Return a w:lvl element holding w:start, w:numFmt and w:lvlText children."""
    level = OxmlElement('w:lvl')
    level.set(qn('w:ilvl'), ilvl)
    # Children are appended in this fixed order: start, numFmt, lvlText.
    for tag, value in (
        ('w:start', start),
        ('w:numFmt', num_fmt),
        ('w:lvlText', lvl_text),
    ):
        child = OxmlElement(tag)
        child.set(qn('w:val'), value)
        level.append(child)
    return level


# Abstract numbering definition with id 1, placed first in the numbering part
abstract_num = OxmlElement('w:abstractNum')
abstract_num.set(qn('w:abstractNumId'), '1')
# Level 0 (Chapter - "3"): decimal, starts at 3
abstract_num.append(_make_level('0', '3', 'decimal', '%1'))
# Level 1 (Section - "3.1", "3.2"): decimal, starts at 1, prefixed by level 0
abstract_num.append(_make_level('1', '1', 'decimal', '%1.%2'))
numbering_xml.insert(0, abstract_num)

# Numbering instance with numId 1 that refers to abstract definition 1
num_elem = OxmlElement('w:num')
num_elem.set(qn('w:numId'), '1')
abstract_ref = OxmlElement('w:abstractNumId')
abstract_ref.set(qn('w:val'), '1')
num_elem.append(abstract_ref)
numbering_xml.append(num_elem)

def add_numbered_heading(doc, text, level, num_id):
    """Append a heading paragraph that is attached to a numbering instance.

    Args:
        doc: Document the paragraph is appended to.
        text: Heading text, added to the paragraph as a single run.
        level: 1-based heading level. Selects the ``Heading {level}`` style
            and is written as numbering level ``level - 1``.
        num_id: Value written to the paragraph's ``w:numId``.

    Returns:
        The newly created paragraph.
    """
    p = doc.add_paragraph()
    p.style = f'Heading {level}'
    pPr = p._p.get_or_add_pPr()

    # Build <w:numPr><w:ilvl/><w:numId/></w:numPr> and attach it to the
    # paragraph properties.
    numPr = OxmlElement('w:numPr')
    ilvl = OxmlElement('w:ilvl')
    # w:ilvl is zero-based while heading levels are one-based.
    ilvl.set(qn('w:val'), str(level - 1))
    numPr.append(ilvl)
    numId_elem = OxmlElement('w:numId')
    numId_elem.set(qn('w:val'), str(num_id))
    numPr.append(numId_elem)
    pPr.append(numPr)

    # The run object itself is not needed, so it is not bound to a name.
    p.add_run(text)
    return p

# Three level-2 headings on numbering instance 1, each followed by one
# body paragraph, then the document is written to disk.
sections = (
    ("Introduction", "Content under 3.1"),
    ("Solution Overview", "Content under 3.2"),
    ("Requirements", "Content under 3.3"),
)
for heading_text, body_text in sections:
    add_numbered_heading(doc, heading_text, 2, 1)
    doc.add_paragraph(body_text)

doc.save("/tmp/test_outline_numbering.docx")

# Now parse with Docling
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("/tmp/test_outline_numbering.docx")

# Print one "<label>: <text>" line per item of the converted document.
for item in result.document.iterate_items():
    # Unwrap the element when the iterator yields a tuple.
    if isinstance(item, tuple):
        element = item[0]
    else:
        element = item

    # Use the label's .value when it has one, otherwise its string form.
    raw_label = element.label
    if hasattr(raw_label, 'value'):
        label = raw_label.value
    else:
        label = str(raw_label)

    # Elements without a text attribute print an empty string.
    text = getattr(element, 'text', "")
    print(f"{label}: {text}")

Output:
section_header: 1 Introduction
text: Content under 3.1
section_header: 2 Solution Overview
text: Content under 3.2
section_header: 3 Requirements
text: Content under 3.3

Docling version

2.64.0

Python version

Python 3.11.14

Metadata

Metadata

Assignees

Labels

bug (Something isn't working), docx (issue related to docx backend), good first issue (Issues and pull requests for new contributors)

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions