# _extract_proto.py
  1. import zipfile, xml.etree.ElementTree as ET
  2. docx_path = r'D:\我的文件\JeecgBoot-main\doc\湛江市人力资源和社会保障局智慧人社运营运维(2025-2027年)项目需求规格说明书-就业一湛通服务平台.docx'
  3. output_path = r'D:\我的文件\JeecgBoot-main\.docs\_proto_docx.txt'
  4. try:
  5. with zipfile.ZipFile(docx_path) as z:
  6. xml_content = z.read('word/document.xml')
  7. root = ET.fromstring(xml_content)
  8. NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
  9. all_text = []
  10. for para in root.iter(NS + 'p'):
  11. texts = []
  12. for t in para.iter(NS + 't'):
  13. if t.text:
  14. texts.append(t.text)
  15. if texts:
  16. all_text.append(''.join(texts))
  17. with open(output_path, 'w', encoding='utf-8') as f:
  18. f.write('\n'.join(all_text))
  19. # Find 方案设计 content
  20. for i, p in enumerate(all_text):
  21. if '方案设计' in p or '页面原型' in p or '原型设计' in p:
  22. for j in range(max(0,i-2), min(len(all_text),i+15)):
  23. print(f'{j}: [{all_text[j][:200]}]')
  24. print('---')
  25. except Exception as e:
  26. print(f'Error: {e}')