data_dealing/ocr_v2.ipynb
2026-02-24 12:23:13 +08:00

2593 lines
184 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"id": "69e8a645-b801-4e72-a98d-fee2fd904f68",
"metadata": {},
"outputs": [],
"source": [
"# -*- coding: utf-8 -*-\n",
"import os\n",
"import re\n",
"import sys\n",
"import json\n",
"from io import BytesIO\n",
"from pathlib import Path\n",
"from typing import List, Optional, Tuple, Dict, Any, Iterable\n",
"\n",
"import fitz # PyMuPDF\n",
"from PIL import Image, ImageOps\n",
"\n",
"# from docx import Document\n",
"# from docx.oxml.table import CT_Tbl\n",
"# from docx.oxml.text.paragraph import CT_P\n",
"# from docx.table import _Cell, Table\n",
"# from docx.text.paragraph import Paragraph\n",
"from docx import Document as docx_open\n",
"from docx.document import Document as DocxDocument\n",
"from docx.oxml.table import CT_Tbl\n",
"from docx.oxml.text.paragraph import CT_P\n",
"from docx.table import _Cell, Table\n",
"from docx.text.paragraph import Paragraph\n",
"\n",
"\n",
"from pptx import Presentation\n",
"\n",
"# Alibaba OCR\n",
"from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client\n",
"from alibabacloud_tea_openapi import models as open_api_models\n",
"from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models\n",
"from alibabacloud_tea_util import models as util_models\n",
"\n",
"# SECURITY: real access keys were hardcoded here and have been redacted — rotate the leaked\n",
"# credentials immediately and supply ALIBABA_CLOUD_ACCESS_KEY_ID /\n",
"# ALIBABA_CLOUD_ACCESS_KEY_SECRET via the shell environment instead of committing them.\n",
"\n",
"# =========================\n",
"# 通用工具\n",
"# =========================\n",
"def safe_name(name: str) -> str:\n",
" name = (name or \"\").strip()\n",
" name = re.sub(r\"[\\\\/:*?\\\"<>|]+\", \"_\", name)\n",
" return name[:150] if len(name) > 150 else name\n",
"\n",
"\n",
"def ensure_dir(p: Path):\n",
" p.mkdir(parents=True, exist_ok=True)\n",
"\n",
"\n",
"def guess_ext_from_content_type(content_type: str) -> str:\n",
" ct = (content_type or \"\").lower()\n",
" if \"jpeg\" in ct or \"jpg\" in ct:\n",
" return \".jpg\"\n",
" if \"png\" in ct:\n",
" return \".png\"\n",
" if \"bmp\" in ct:\n",
" return \".bmp\"\n",
" if \"tiff\" in ct or \"tif\" in ct:\n",
" return \".tif\"\n",
" if \"gif\" in ct:\n",
" return \".gif\"\n",
" return \".bin\"\n",
"\n",
"\n",
"def dedup_keep_order(lines: List[str]) -> List[str]:\n",
" seen = set()\n",
" out = []\n",
" for s in lines:\n",
" k = s.strip()\n",
" if not k:\n",
" continue\n",
" if k in seen:\n",
" continue\n",
" seen.add(k)\n",
" out.append(s)\n",
" return out\n",
"\n",
"\n",
"# =========================\n",
"# OCR图片标准化 + Alibaba OCR输出“纯文本”\n",
"# =========================\n",
"MIN_BYTES = 200\n",
"MAX_SIDE = 3000\n",
"MIN_FILE_SIZE = 5 * 1024 # 5KB\n",
"\n",
"\n",
"def _flatten_strings(obj: Any) -> Iterable[str]:\n",
" \"\"\"兜底:递归把 dict/list 里所有字符串叶子都拿出来\"\"\"\n",
" if obj is None:\n",
" return\n",
" if isinstance(obj, str):\n",
" yield obj\n",
" elif isinstance(obj, dict):\n",
" for v in obj.values():\n",
" yield from _flatten_strings(v)\n",
" elif isinstance(obj, list):\n",
" for it in obj:\n",
" yield from _flatten_strings(it)\n",
"\n",
"\n",
"def ocr_data_to_text(data: Any) -> str:\n",
" \"\"\"\n",
" 把阿里 OCR 返回的 data可能是 dict/list/str尽量转换成可读纯文本。\n",
" 兼容多种可能结构;如果识别不到明确字段,就降级为抽取所有字符串并做轻度清洗。\n",
" \"\"\"\n",
" if data is None:\n",
" return \"\"\n",
"\n",
" # 如果是字符串且像 JSON尝试 parse\n",
" if isinstance(data, str):\n",
" s = data.strip()\n",
" # 有些 SDK 直接 str(dict),也有可能是 JSON 字符串\n",
" if (s.startswith(\"{\") and s.endswith(\"}\")) or (s.startswith(\"[\") and s.endswith(\"]\")):\n",
" try:\n",
" data = json.loads(s)\n",
" except Exception:\n",
" # 不是严格 JSON继续当普通字符串\n",
" return s\n",
" else:\n",
" return s\n",
"\n",
" # 常见:顶层 dict\n",
" if isinstance(data, dict):\n",
" # 1) 直接有 content/text 字段\n",
" for k in (\"content\", \"text\", \"result\", \"data\"):\n",
" v = data.get(k)\n",
" if isinstance(v, str) and v.strip():\n",
" return v.strip()\n",
"\n",
" # 2) 常见字段prism_wordsInfo / wordsInfo / words_info\n",
" # 每个 item 里可能是 word / text / content\n",
" for k in (\"prism_wordsInfo\", \"wordsInfo\", \"words_info\", \"words\", \"lines\"):\n",
" v = data.get(k)\n",
" if isinstance(v, list) and v:\n",
" lines = []\n",
" for item in v:\n",
" if isinstance(item, dict):\n",
" for kk in (\"word\", \"text\", \"content\", \"value\"):\n",
" if isinstance(item.get(kk), str) and item.get(kk).strip():\n",
" lines.append(item[kk].strip())\n",
" break\n",
" lines = dedup_keep_order(lines)\n",
" if lines:\n",
" return \"\\n\".join(lines)\n",
"\n",
" # 3) 有些会返回 paragraphs/blocks\n",
" for k in (\"paragraphs\", \"blocks\"):\n",
" v = data.get(k)\n",
" if isinstance(v, list) and v:\n",
" lines = []\n",
" for b in v:\n",
" if isinstance(b, dict):\n",
" # block 里可能有 text/content/lines\n",
" for kk in (\"text\", \"content\"):\n",
" if isinstance(b.get(kk), str) and b[kk].strip():\n",
" lines.append(b[kk].strip())\n",
" break\n",
" else:\n",
" # 或者 b['lines'] 是 list\n",
" if isinstance(b.get(\"lines\"), list):\n",
" for ln in b[\"lines\"]:\n",
" if isinstance(ln, dict):\n",
" for kk2 in (\"text\", \"content\", \"word\"):\n",
" if isinstance(ln.get(kk2), str) and ln[kk2].strip():\n",
" lines.append(ln[kk2].strip())\n",
" break\n",
" lines = [x for x in lines if x.strip()]\n",
" if lines:\n",
" return \"\\n\".join(lines)\n",
"\n",
" # 兜底:把所有字符串叶子抽出来\n",
" all_str = [s.strip() for s in _flatten_strings(data) if isinstance(s, str) and s.strip()]\n",
" # 过滤掉明显无意义的短 token可按需调整\n",
" all_str = [s for s in all_str if len(s) >= 2]\n",
" all_str = dedup_keep_order(all_str)\n",
" return \"\\n\".join(all_str)\n",
"\n",
" # 顶层 list尝试当作行列表\n",
" if isinstance(data, list):\n",
" lines = []\n",
" for it in data:\n",
" if isinstance(it, str) and it.strip():\n",
" lines.append(it.strip())\n",
" elif isinstance(it, dict):\n",
" for kk in (\"text\", \"content\", \"word\", \"value\"):\n",
" if isinstance(it.get(kk), str) and it[kk].strip():\n",
" lines.append(it[kk].strip())\n",
" break\n",
" lines = dedup_keep_order(lines)\n",
" if lines:\n",
" return \"\\n\".join(lines)\n",
" # 再兜底\n",
" all_str = [s.strip() for s in _flatten_strings(data) if isinstance(s, str) and s.strip()]\n",
" all_str = [s for s in all_str if len(s) >= 2]\n",
" all_str = dedup_keep_order(all_str)\n",
" return \"\\n\".join(all_str)\n",
"\n",
" # 其他类型兜底\n",
" return str(data).strip()\n",
"\n",
"\n",
"class AlibabaOCR:\n",
" def __init__(self, endpoint: str = \"ocr-api.cn-hangzhou.aliyuncs.com\"):\n",
" self.endpoint = endpoint\n",
" self.client = self._try_create_client()\n",
"\n",
" def _try_create_client(self) -> Optional[ocr_api20210707Client]:\n",
" ak = os.environ.get(\"ALIBABA_CLOUD_ACCESS_KEY_ID\")\n",
" sk = os.environ.get(\"ALIBABA_CLOUD_ACCESS_KEY_SECRET\")\n",
" if not ak or not sk:\n",
" return None\n",
" config = open_api_models.Config(access_key_id=ak, access_key_secret=sk)\n",
" config.endpoint = self.endpoint\n",
" return ocr_api20210707Client(config)\n",
"\n",
" @staticmethod\n",
" def normalize_image_to_jpeg_bytes(image_bytes: bytes) -> Optional[bytes]:\n",
" try:\n",
" with Image.open(BytesIO(image_bytes)) as img:\n",
" img = ImageOps.exif_transpose(img)\n",
" if img.mode != \"RGB\":\n",
" img = img.convert(\"RGB\")\n",
"\n",
" w, h = img.size\n",
" m = max(w, h)\n",
" if m > MAX_SIDE:\n",
" scale = MAX_SIDE / float(m)\n",
" img = img.resize((int(w * scale), int(h * scale)))\n",
"\n",
" buf = BytesIO()\n",
" img.save(buf, format=\"JPEG\", quality=90, optimize=True)\n",
" data = buf.getvalue()\n",
" if len(data) < MIN_BYTES:\n",
" return None\n",
" return data\n",
" except Exception:\n",
" return None\n",
"\n",
" def ocr_bytes_text(self, image_jpeg_bytes: bytes, ocr_type: str = \"General\") -> Optional[str]:\n",
" \"\"\"\n",
" 返回可读纯文本(已经做过结构抽取和清洗)。\n",
" \"\"\"\n",
" if not self.client:\n",
" return None\n",
" req = ocr_api_20210707_models.RecognizeAllTextRequest(body=image_jpeg_bytes, type=ocr_type)\n",
" runtime = util_models.RuntimeOptions()\n",
" try:\n",
" resp = self.client.recognize_all_text_with_options(req, runtime)\n",
" data = getattr(resp.body, \"data\", None)\n",
" text = ocr_data_to_text(data)\n",
" text = (text or \"\").strip()\n",
" return text if text else None\n",
" except Exception:\n",
" return None\n",
"\n",
"\n",
"# =========================\n",
"# DOCX按内容顺序抽取 + 图片 OCR\n",
"# =========================\n",
"_DOCX_NS = {\n",
" \"a\": \"http://schemas.openxmlformats.org/drawingml/2006/main\",\n",
" \"r\": \"http://schemas.openxmlformats.org/officeDocument/2006/relationships\",\n",
" \"w\": \"http://schemas.openxmlformats.org/wordprocessingml/2006/main\",\n",
" \"wp\": \"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing\",\n",
" \"pic\": \"http://schemas.openxmlformats.org/drawingml/2006/picture\",\n",
"}\n",
"\n",
"\n",
"# def _iter_docx_block_items(parent) -> List[Any]:\n",
"# if isinstance(parent, Document):\n",
"# parent_elm = parent.element.body\n",
"# else:\n",
"# parent_elm = parent._tc\n",
"\n",
"# out = []\n",
"# for child in parent_elm.iterchildren():\n",
"# if isinstance(child, CT_P):\n",
"# out.append(Paragraph(child, parent))\n",
"# elif isinstance(child, CT_Tbl):\n",
"# out.append(Table(child, parent))\n",
"# return out\n",
"def _iter_docx_block_items(parent) -> List[Any]:\n",
" \"\"\"\n",
" 按 body 顺序返回 Paragraph / Table\n",
" parent 可以是 DocxDocument 或 _Cell\n",
" \"\"\"\n",
" if isinstance(parent, DocxDocument):\n",
" parent_elm = parent.element.body\n",
" else:\n",
" parent_elm = parent._tc # _Cell\n",
"\n",
" out = []\n",
" for child in parent_elm.iterchildren():\n",
" if isinstance(child, CT_P):\n",
" out.append(Paragraph(child, parent))\n",
" elif isinstance(child, CT_Tbl):\n",
" out.append(Table(child, parent))\n",
" return out\n",
"\n",
"\n",
"\n",
"def _docx_paragraph_text(p: Paragraph) -> str:\n",
" return (p.text or \"\").strip()\n",
"\n",
"\n",
"def _docx_table_text(tbl: Table) -> List[str]:\n",
" lines = []\n",
" for row in tbl.rows:\n",
" row_text = []\n",
" for cell in row.cells:\n",
" cell_t = (cell.text or \"\").strip()\n",
" if cell_t:\n",
" row_text.append(cell_t)\n",
" if row_text:\n",
" lines.append(\"\\t\".join(row_text))\n",
" return lines\n",
"\n",
"\n",
"# def _docx_find_image_rids_in_paragraph(p: Paragraph) -> List[str]:\n",
"# rids = []\n",
"# blips = p._p.xpath(\".//a:blip\", namespaces=_DOCX_NS)\n",
"# for blip in blips:\n",
"# rid = blip.get(f\"{{{_DOCX_NS['r']}}}embed\")\n",
"# if rid:\n",
"# rids.append(rid)\n",
"# return rids\n",
"def _docx_find_image_rids_in_paragraph(p: Paragraph) -> List[str]:\n",
" \"\"\"\n",
" 在段落 XML 里按出现顺序找图片 rIda:blip 的 r:embed / r:link\n",
" 不用 namespaces 参数,兼容 python-docx 的 xpath 实现。\n",
" \"\"\"\n",
" rids: List[str] = []\n",
"\n",
" # 找所有 blip 节点(不依赖命名空间前缀)\n",
" blips = p._p.xpath('.//*[local-name()=\"blip\"]')\n",
" for blip in blips:\n",
" # embed 或 link 都可能承载关系 id\n",
" rid = blip.get(\n",
" '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'\n",
" ) or blip.get(\n",
" '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}link'\n",
" )\n",
" if rid:\n",
" rids.append(rid)\n",
"\n",
" return rids\n",
"\n",
"\n",
"\n",
"def extract_docx_in_order(docx_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" # doc = Document(str(docx_path))\n",
" doc = docx_open(str(docx_path))\n",
" rels = doc.part.rels\n",
"\n",
" out_lines: List[str] = []\n",
" img_counter = 0\n",
"\n",
" def handle_paragraph(p: Paragraph):\n",
" nonlocal img_counter\n",
" t = _docx_paragraph_text(p)\n",
" if t:\n",
" out_lines.append(t)\n",
"\n",
" rids = _docx_find_image_rids_in_paragraph(p)\n",
" for rid in rids:\n",
" rel = rels.get(rid)\n",
" if not rel:\n",
" continue\n",
" part = rel.target_part\n",
" blob = getattr(part, \"blob\", None)\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
" content_type = getattr(part, \"content_type\", \"\")\n",
" ext = guess_ext_from_content_type(content_type)\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"img_{img_counter:04d}{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" out_lines.append(f\"[图片 {img_counter:04d}](跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" text = ocr.ocr_bytes_text(norm, ocr_type=\"General\")\n",
" print(text)\n",
" if text:\n",
" out_lines.append(f\"[图片 {img_counter:04d} 识别文本]\")\n",
" out_lines.append(text)\n",
" else:\n",
" out_lines.append(f\"[图片 {img_counter:04d}]OCR 失败或未配置 AK/SK\")\n",
"\n",
" def handle_table(tbl: Table):\n",
" for line in _docx_table_text(tbl):\n",
" out_lines.append(line)\n",
"\n",
" for row in tbl.rows:\n",
" for cell in row.cells:\n",
" cell: _Cell\n",
" for item in _iter_docx_block_items(cell):\n",
" if isinstance(item, Paragraph):\n",
" handle_paragraph(item)\n",
" elif isinstance(item, Table):\n",
" handle_table(item)\n",
"\n",
" for item in _iter_docx_block_items(doc):\n",
" if isinstance(item, Paragraph):\n",
" handle_paragraph(item)\n",
" elif isinstance(item, Table):\n",
" handle_table(item)\n",
"\n",
" return \"\\n\".join(out_lines)\n",
"\n",
"\n",
"# =========================\n",
"# PPTX按坐标阅读顺序排序 shapes + 图片 OCR\n",
"# =========================\n",
"def _shape_sort_key(shape, y_tol_emu: int = 20000) -> Tuple[int, int, int]:\n",
" \"\"\"\n",
" y_tol_emu行容差EMU 单位。PowerPoint 1 inch = 914400 EMU。\n",
" 20000 EMU ~ 0.022 inch属于很小的容差用来把“同一行”的微差抹平。\n",
" \"\"\"\n",
" try:\n",
" top = int(getattr(shape, \"top\", 0))\n",
" except Exception:\n",
" top = 0\n",
" try:\n",
" left = int(getattr(shape, \"left\", 0))\n",
" except Exception:\n",
" left = 0\n",
"\n",
" # 把 top 按容差分桶,保证同一行内部按 left 排序更稳定\n",
" top_bucket = (top // y_tol_emu) * y_tol_emu\n",
" # 额外用 shape_id 稳定排序\n",
" sid = int(getattr(shape, \"shape_id\", 0))\n",
" return (top_bucket, left, sid)\n",
"\n",
"\n",
"def extract_pptx_in_order(pptx_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" prs = Presentation(str(pptx_path))\n",
" out_lines: List[str] = []\n",
" img_counter = 0\n",
"\n",
" for s_idx, slide in enumerate(prs.slides, 1):\n",
" out_lines.append(f\"=== 第 {s_idx} 页Slide {s_idx}===\")\n",
"\n",
" shapes = list(slide.shapes)\n",
" shapes.sort(key=_shape_sort_key)\n",
"\n",
" for shape in shapes:\n",
" # 文本\n",
" if hasattr(shape, \"text\"):\n",
" t = (shape.text or \"\").strip()\n",
" if t:\n",
" out_lines.append(t)\n",
"\n",
" # 图片shape_type=13\n",
" if getattr(shape, \"shape_type\", None) == 13:\n",
" try:\n",
" img = shape.image\n",
" blob = img.blob\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
" ext = (img.ext or \"png\").lower()\n",
" if ext == \"jpeg\":\n",
" ext = \"jpg\"\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"slide{s_idx:02d}_img_{img_counter:04d}.{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" out_lines.append(f\"[图片 {img_counter:04d}](跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" text = ocr.ocr_bytes_text(norm, ocr_type=\"General\")\n",
" if text:\n",
" out_lines.append(f\"[图片 {img_counter:04d} 识别文本]\")\n",
" out_lines.append(text)\n",
" else:\n",
" out_lines.append(f\"[图片 {img_counter:04d}]OCR 失败或未配置 AK/SK\")\n",
"\n",
" except Exception:\n",
" continue\n",
"\n",
" return \"\\n\".join(out_lines)\n",
"\n",
"\n",
"# =========================\n",
"# PDF按页面 block 坐标排序输出(文本/图片混排)+ 图片 OCR\n",
"# =========================\n",
"def _pdf_blocks_in_reading_order(page_dict: Dict[str, Any]) -> List[Dict[str, Any]]:\n",
" blocks = page_dict.get(\"blocks\", []) or []\n",
"\n",
" def key_fn(b):\n",
" bbox = b.get(\"bbox\") or [0, 0, 0, 0]\n",
" # y0, x0\n",
" return (round(bbox[1], 3), round(bbox[0], 3))\n",
"\n",
" return sorted(blocks, key=key_fn)\n",
"\n",
"\n",
"def _pdf_text_from_block(block: Dict[str, Any]) -> str:\n",
" lines = []\n",
" for line in block.get(\"lines\", []) or []:\n",
" spans = line.get(\"spans\", []) or []\n",
" # span text 拼接成行\n",
" txt = \"\".join((s.get(\"text\") or \"\") for s in spans).strip()\n",
" if txt:\n",
" lines.append(txt)\n",
" return \"\\n\".join(lines).strip()\n",
"\n",
"\n",
"def extract_pdf_in_order(pdf_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" doc = fitz.open(str(pdf_path))\n",
" out_lines: List[str] = []\n",
" img_counter = 0\n",
"\n",
" for p_idx in range(len(doc)):\n",
" page = doc[p_idx]\n",
" out_lines.append(f\"=== 第 {p_idx + 1} 页Page {p_idx + 1}===\")\n",
"\n",
" d = page.get_text(\"dict\")\n",
" blocks = _pdf_blocks_in_reading_order(d)\n",
"\n",
" for b in blocks:\n",
" btype = b.get(\"type\")\n",
" if btype == 0:\n",
" t = _pdf_text_from_block(b)\n",
" if t:\n",
" out_lines.append(t)\n",
" elif btype == 1:\n",
" xref = b.get(\"xref\")\n",
" if not xref:\n",
" continue\n",
" try:\n",
" base = doc.extract_image(xref)\n",
" except Exception:\n",
" continue\n",
" if not base:\n",
" continue\n",
"\n",
" blob = base.get(\"image\")\n",
" ext = (base.get(\"ext\") or \"png\").lower()\n",
" if ext == \"jpeg\":\n",
" ext = \"jpg\"\n",
"\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"page{p_idx+1:03d}_img_{img_counter:04d}.{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" out_lines.append(f\"[图片 {img_counter:04d}](跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" text = ocr.ocr_bytes_text(norm, ocr_type=\"General\")\n",
" # print(text)\n",
" if text:\n",
" out_lines.append(f\"[图片 {img_counter:04d} 识别文本]\")\n",
" out_lines.append(text)\n",
" else:\n",
" out_lines.append(f\"[图片 {img_counter:04d}]OCR 失败或未配置 AK/SK\")\n",
"\n",
" doc.close()\n",
" return \"\\n\".join(out_lines)\n",
"\n",
"\n",
"# =========================\n",
"# 单文件主入口:按内容顺序输出到 txt\n",
"# =========================\n",
"def extract_single_file_to_txt(\n",
" file_path: str,\n",
" output_dir: str = \"output_single\",\n",
" save_images: bool = False,\n",
") -> str:\n",
" p = Path(file_path)\n",
" if not p.exists() or not p.is_file():\n",
" raise RuntimeError(f\"文件不存在:{file_path}\")\n",
"\n",
" ext = p.suffix.lower()\n",
" if ext not in (\".pdf\", \".docx\", \".pptx\"):\n",
" raise RuntimeError(f\"不支持的文件类型:{ext}(仅支持 pdf/docx/pptx\")\n",
"\n",
" out_root = Path(output_dir)\n",
" ensure_dir(out_root)\n",
"\n",
" base = safe_name(p.stem)\n",
" out_txt = out_root / f\"{base}{ext}.txt\"\n",
" print(out_root)\n",
"\n",
" img_dir = None\n",
" if save_images:\n",
" img_dir = out_root / f\"{base}_images\"\n",
" ensure_dir(img_dir)\n",
"\n",
" ocr = AlibabaOCR()\n",
"\n",
" if ext == \".docx\":\n",
" content = extract_docx_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
" elif ext == \".pptx\":\n",
" content = extract_pptx_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
" else:\n",
" content = extract_pdf_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
"\n",
" out_txt.write_text(content or \"\", encoding=\"utf-8\")\n",
" return str(out_txt.resolve())\n",
"\n",
"\n",
"# if __name__ == \"__main__\":\n",
"# if len(sys.argv) < 2:\n",
"# print(\"用法python extract_one.py <file_path> [output_dir] [save_images(0/1)]\")\n",
"# print(\"示例python extract_one.py ./a.pdf ./out 1\")\n",
"# sys.exit(1)\n",
"\n",
"# file_path = sys.argv[1]\n",
"# output_dir = sys.argv[2] if len(sys.argv) >= 3 else \"output_single\"\n",
"# save_images = bool(int(sys.argv[3])) if len(sys.argv) >= 4 else False\n",
"\n",
"# out_txt = extract_single_file_to_txt(file_path, output_dir=output_dir, save_images=save_images)\n",
"# print(f\"完成。输出文件:{out_txt}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58e59464-9ab6-4128-8a00-36e881b9bc1d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 28,
"id": "d57d38e0-6c96-4323-8dd2-2ae2d67a7995",
"metadata": {},
"outputs": [],
"source": [
"# -*- coding: utf-8 -*-\n",
"import os\n",
"import re\n",
"import sys\n",
"import json\n",
"from io import BytesIO\n",
"from pathlib import Path\n",
"from typing import List, Optional, Tuple, Dict, Any, Iterable\n",
"\n",
"import fitz # PyMuPDF\n",
"from PIL import Image, ImageOps\n",
"\n",
"# from docx import Document\n",
"# from docx.oxml.table import CT_Tbl\n",
"# from docx.oxml.text.paragraph import CT_P\n",
"# from docx.table import _Cell, Table\n",
"# from docx.text.paragraph import Paragraph\n",
"from docx import Document as docx_open\n",
"from docx.document import Document as DocxDocument\n",
"from docx.oxml.table import CT_Tbl\n",
"from docx.oxml.text.paragraph import CT_P\n",
"from docx.table import _Cell, Table\n",
"from docx.text.paragraph import Paragraph\n",
"\n",
"\n",
"from pptx import Presentation\n",
"\n",
"# Alibaba OCR\n",
"from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client\n",
"from alibabacloud_tea_openapi import models as open_api_models\n",
"from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models\n",
"from alibabacloud_tea_util import models as util_models\n",
"\n",
"# SECURITY: two sets of real access keys were hardcoded here (one active, one commented out)\n",
"# and have been redacted — rotate the leaked credentials immediately and supply\n",
"# ALIBABA_CLOUD_ACCESS_KEY_ID / ALIBABA_CLOUD_ACCESS_KEY_SECRET via the shell environment\n",
"# instead of committing them.\n",
"\n",
"# =========================\n",
"# 通用工具\n",
"# =========================\n",
"def safe_name(name: str) -> str:\n",
" name = (name or \"\").strip()\n",
" name = re.sub(r\"[\\\\/:*?\\\"<>|]+\", \"_\", name)\n",
" return name[:150] if len(name) > 150 else name\n",
"\n",
"\n",
"def ensure_dir(p: Path):\n",
" p.mkdir(parents=True, exist_ok=True)\n",
"\n",
"\n",
"def guess_ext_from_content_type(content_type: str) -> str:\n",
" ct = (content_type or \"\").lower()\n",
" if \"jpeg\" in ct or \"jpg\" in ct:\n",
" return \".jpg\"\n",
" if \"png\" in ct:\n",
" return \".png\"\n",
" if \"bmp\" in ct:\n",
" return \".bmp\"\n",
" if \"tiff\" in ct or \"tif\" in ct:\n",
" return \".tif\"\n",
" if \"gif\" in ct:\n",
" return \".gif\"\n",
" return \".bin\"\n",
"\n",
"\n",
"def dedup_keep_order(lines: List[str]) -> List[str]:\n",
" seen = set()\n",
" out = []\n",
" for s in lines:\n",
" k = s.strip()\n",
" if not k:\n",
" continue\n",
" if k in seen:\n",
" continue\n",
" seen.add(k)\n",
" out.append(s)\n",
" return out\n",
"\n",
"\n",
"# =========================\n",
"# OCR图片标准化 + Alibaba OCR输出“纯文本”\n",
"# =========================\n",
"MIN_BYTES = 200\n",
"MAX_SIDE = 3000\n",
"MIN_FILE_SIZE = 5 * 1024 # 5KB\n",
"\n",
"\n",
"def _flatten_strings(obj: Any) -> Iterable[str]:\n",
" \"\"\"兜底:递归把 dict/list 里所有字符串叶子都拿出来\"\"\"\n",
" if obj is None:\n",
" return\n",
" if isinstance(obj, str):\n",
" yield obj\n",
" elif isinstance(obj, dict):\n",
" for v in obj.values():\n",
" yield from _flatten_strings(v)\n",
" elif isinstance(obj, list):\n",
" for it in obj:\n",
" yield from _flatten_strings(it)\n",
"\n",
"\n",
"def ocr_data_to_text(data: Any) -> str:\n",
" \"\"\"\n",
" 把阿里 OCR 返回的 data可能是 dict/list/str尽量转换成可读纯文本。\n",
" 兼容多种可能结构;如果识别不到明确字段,就降级为抽取所有字符串并做轻度清洗。\n",
" \"\"\"\n",
" if data is None:\n",
" return \"\"\n",
"\n",
" # 如果是字符串且像 JSON尝试 parse\n",
" if isinstance(data, str):\n",
" s = data.strip()\n",
" # 有些 SDK 直接 str(dict),也有可能是 JSON 字符串\n",
" if (s.startswith(\"{\") and s.endswith(\"}\")) or (s.startswith(\"[\") and s.endswith(\"]\")):\n",
" try:\n",
" data = json.loads(s)\n",
" except Exception:\n",
" # 不是严格 JSON继续当普通字符串\n",
" return s\n",
" else:\n",
" return s\n",
"\n",
" # 常见:顶层 dict\n",
" if isinstance(data, dict):\n",
" # 1) 直接有 content/text 字段\n",
" for k in (\"content\", \"text\", \"result\", \"data\"):\n",
" v = data.get(k)\n",
" if isinstance(v, str) and v.strip():\n",
" return v.strip()\n",
"\n",
" # 2) 常见字段prism_wordsInfo / wordsInfo / words_info\n",
" # 每个 item 里可能是 word / text / content\n",
" for k in (\"prism_wordsInfo\", \"wordsInfo\", \"words_info\", \"words\", \"lines\"):\n",
" v = data.get(k)\n",
" if isinstance(v, list) and v:\n",
" lines = []\n",
" for item in v:\n",
" if isinstance(item, dict):\n",
" for kk in (\"word\", \"text\", \"content\", \"value\"):\n",
" if isinstance(item.get(kk), str) and item.get(kk).strip():\n",
" lines.append(item[kk].strip())\n",
" break\n",
" lines = dedup_keep_order(lines)\n",
" if lines:\n",
" return \"\\n\".join(lines)\n",
"\n",
" # 3) 有些会返回 paragraphs/blocks\n",
" for k in (\"paragraphs\", \"blocks\"):\n",
" v = data.get(k)\n",
" if isinstance(v, list) and v:\n",
" lines = []\n",
" for b in v:\n",
" if isinstance(b, dict):\n",
" # block 里可能有 text/content/lines\n",
" for kk in (\"text\", \"content\"):\n",
" if isinstance(b.get(kk), str) and b[kk].strip():\n",
" lines.append(b[kk].strip())\n",
" break\n",
" else:\n",
" # 或者 b['lines'] 是 list\n",
" if isinstance(b.get(\"lines\"), list):\n",
" for ln in b[\"lines\"]:\n",
" if isinstance(ln, dict):\n",
" for kk2 in (\"text\", \"content\", \"word\"):\n",
" if isinstance(ln.get(kk2), str) and ln[kk2].strip():\n",
" lines.append(ln[kk2].strip())\n",
" break\n",
" lines = [x for x in lines if x.strip()]\n",
" if lines:\n",
" return \"\\n\".join(lines)\n",
"\n",
" # 兜底:把所有字符串叶子抽出来\n",
" all_str = [s.strip() for s in _flatten_strings(data) if isinstance(s, str) and s.strip()]\n",
" # 过滤掉明显无意义的短 token可按需调整\n",
" all_str = [s for s in all_str if len(s) >= 2]\n",
" all_str = dedup_keep_order(all_str)\n",
" return \"\\n\".join(all_str)\n",
"\n",
" # 顶层 list尝试当作行列表\n",
" if isinstance(data, list):\n",
" lines = []\n",
" for it in data:\n",
" if isinstance(it, str) and it.strip():\n",
" lines.append(it.strip())\n",
" elif isinstance(it, dict):\n",
" for kk in (\"text\", \"content\", \"word\", \"value\"):\n",
" if isinstance(it.get(kk), str) and it[kk].strip():\n",
" lines.append(it[kk].strip())\n",
" break\n",
" lines = dedup_keep_order(lines)\n",
" if lines:\n",
" return \"\\n\".join(lines)\n",
" # 再兜底\n",
" all_str = [s.strip() for s in _flatten_strings(data) if isinstance(s, str) and s.strip()]\n",
" all_str = [s for s in all_str if len(s) >= 2]\n",
" all_str = dedup_keep_order(all_str)\n",
" return \"\\n\".join(all_str)\n",
"\n",
" # 其他类型兜底\n",
" return str(data).strip()\n",
"\n",
"\n",
"class AlibabaOCR:\n",
" def __init__(self, endpoint: str = \"ocr-api.cn-hangzhou.aliyuncs.com\"):\n",
" self.endpoint = endpoint\n",
" self.client = self._try_create_client()\n",
"\n",
" def _try_create_client(self) -> Optional[ocr_api20210707Client]:\n",
" ak = os.environ.get(\"ALIBABA_CLOUD_ACCESS_KEY_ID\")\n",
" sk = os.environ.get(\"ALIBABA_CLOUD_ACCESS_KEY_SECRET\")\n",
" if not ak or not sk:\n",
" return None\n",
" config = open_api_models.Config(access_key_id=ak, access_key_secret=sk)\n",
" config.endpoint = self.endpoint\n",
" return ocr_api20210707Client(config)\n",
"\n",
" @staticmethod\n",
" def normalize_image_to_jpeg_bytes(image_bytes: bytes) -> Optional[bytes]:\n",
" try:\n",
" with Image.open(BytesIO(image_bytes)) as img:\n",
" img = ImageOps.exif_transpose(img)\n",
" if img.mode != \"RGB\":\n",
" img = img.convert(\"RGB\")\n",
"\n",
" w, h = img.size\n",
" m = max(w, h)\n",
" if m > MAX_SIDE:\n",
" scale = MAX_SIDE / float(m)\n",
" img = img.resize((int(w * scale), int(h * scale)))\n",
"\n",
" buf = BytesIO()\n",
" img.save(buf, format=\"JPEG\", quality=90, optimize=True)\n",
" data = buf.getvalue()\n",
" if len(data) < MIN_BYTES:\n",
" return None\n",
" return data\n",
" except Exception:\n",
" return None\n",
"\n",
" def ocr_bytes_text(self, image_jpeg_bytes: bytes, ocr_type: str = \"General\") -> Optional[str]:\n",
" \"\"\"\n",
" 返回可读纯文本(已经做过结构抽取和清洗)。\n",
" \"\"\"\n",
" if not self.client:\n",
" return None\n",
" req = ocr_api_20210707_models.RecognizeAllTextRequest(body=image_jpeg_bytes, type=ocr_type)\n",
" runtime = util_models.RuntimeOptions()\n",
" try:\n",
" resp = self.client.recognize_all_text_with_options(req, runtime)\n",
" data = getattr(resp.body, \"data\", None)\n",
" text = ocr_data_to_text(data)\n",
" text = (text or \"\").strip()\n",
" return text if text else None\n",
" except Exception:\n",
" return None\n",
"\n",
"\n",
"# =========================\n",
"# DOCX按内容顺序抽取 + 图片 OCR\n",
"# =========================\n",
"_DOCX_NS = {\n",
" \"a\": \"http://schemas.openxmlformats.org/drawingml/2006/main\",\n",
" \"r\": \"http://schemas.openxmlformats.org/officeDocument/2006/relationships\",\n",
" \"w\": \"http://schemas.openxmlformats.org/wordprocessingml/2006/main\",\n",
" \"wp\": \"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing\",\n",
" \"pic\": \"http://schemas.openxmlformats.org/drawingml/2006/picture\",\n",
"}\n",
"\n",
"\n",
"def _iter_docx_block_items(parent) -> List[Any]:\n",
" \"\"\"\n",
" 按 body 顺序返回 Paragraph / Table\n",
" parent 可以是 DocxDocument 或 _Cell\n",
" \"\"\"\n",
" if isinstance(parent, DocxDocument):\n",
" parent_elm = parent.element.body\n",
" else:\n",
" parent_elm = parent._tc # _Cell\n",
"\n",
" out = []\n",
" for child in parent_elm.iterchildren():\n",
" if isinstance(child, CT_P):\n",
" out.append(Paragraph(child, parent))\n",
" elif isinstance(child, CT_Tbl):\n",
" out.append(Table(child, parent))\n",
" return out\n",
"\n",
"\n",
"\n",
"def _docx_paragraph_text(p: Paragraph) -> str:\n",
" return (p.text or \"\").strip()\n",
"\n",
"\n",
"def _docx_table_text(tbl: Table) -> List[str]:\n",
" lines = []\n",
" for row in tbl.rows:\n",
" row_text = []\n",
" for cell in row.cells:\n",
" cell_t = (cell.text or \"\").strip()\n",
" if cell_t:\n",
" row_text.append(cell_t)\n",
" if row_text:\n",
" lines.append(\"\\t\".join(row_text))\n",
" return lines\n",
"\n",
"def _docx_find_image_rids_in_paragraph(p: Paragraph) -> List[str]:\n",
" \"\"\"\n",
" 在段落 XML 里按出现顺序找图片 rIda:blip 的 r:embed / r:link\n",
" 不用 namespaces 参数,兼容 python-docx 的 xpath 实现。\n",
" \"\"\"\n",
" rids: List[str] = []\n",
"\n",
" # 找所有 blip 节点(不依赖命名空间前缀)\n",
" blips = p._p.xpath('.//*[local-name()=\"blip\"]')\n",
" for blip in blips:\n",
" # embed 或 link 都可能承载关系 id\n",
" rid = blip.get(\n",
" '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'\n",
" ) or blip.get(\n",
" '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}link'\n",
" )\n",
" if rid:\n",
" rids.append(rid)\n",
"\n",
" return rids\n",
"\n",
"\n",
"\n",
"def extract_docx_in_order(docx_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" # doc = Document(str(docx_path))\n",
" doc = docx_open(str(docx_path))\n",
" rels = doc.part.rels\n",
"\n",
" out_lines: List[str] = []\n",
" img_counter = 0\n",
"\n",
" def handle_paragraph(p: Paragraph):\n",
" nonlocal img_counter\n",
" t = _docx_paragraph_text(p)\n",
" if t:\n",
" out_lines.append(t)\n",
"\n",
" rids = _docx_find_image_rids_in_paragraph(p)\n",
" for rid in rids:\n",
" rel = rels.get(rid)\n",
" if not rel:\n",
" continue\n",
" part = rel.target_part\n",
" blob = getattr(part, \"blob\", None)\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
" content_type = getattr(part, \"content_type\", \"\")\n",
" ext = guess_ext_from_content_type(content_type)\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"img_{img_counter:04d}{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" out_lines.append(f\"[图片 {img_counter:04d}](跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" text = ocr.ocr_bytes_text(norm, ocr_type=\"General\")\n",
" print(text)\n",
" if text:\n",
" out_lines.append(f\"[图片 {img_counter:04d} 识别文本]\")\n",
" out_lines.append(text)\n",
" else:\n",
    "                out_lines.append(f\"[图片 {img_counter:04d}](OCR 失败或未配置 AK/SK\")\n",
"\n",
" def handle_table(tbl: Table):\n",
" for line in _docx_table_text(tbl):\n",
" out_lines.append(line)\n",
"\n",
" for row in tbl.rows:\n",
" for cell in row.cells:\n",
" cell: _Cell\n",
" for item in _iter_docx_block_items(cell):\n",
" if isinstance(item, Paragraph):\n",
" handle_paragraph(item)\n",
" elif isinstance(item, Table):\n",
" handle_table(item)\n",
"\n",
" for item in _iter_docx_block_items(doc):\n",
" if isinstance(item, Paragraph):\n",
" handle_paragraph(item)\n",
" elif isinstance(item, Table):\n",
" handle_table(item)\n",
"\n",
" return \"\\n\".join(out_lines)\n",
"\n",
"\n",
"# =========================\n",
"# PPTX按坐标阅读顺序排序 shapes + 图片 OCR\n",
"# =========================\n",
"def _shape_sort_key(shape, y_tol_emu: int = 20000) -> Tuple[int, int, int]:\n",
" \"\"\"\n",
" y_tol_emu行容差EMU 单位。PowerPoint 1 inch = 914400 EMU。\n",
" 20000 EMU ~ 0.022 inch属于很小的容差用来把“同一行”的微差抹平。\n",
" \"\"\"\n",
" try:\n",
" top = int(getattr(shape, \"top\", 0))\n",
" except Exception:\n",
" top = 0\n",
" try:\n",
" left = int(getattr(shape, \"left\", 0))\n",
" except Exception:\n",
" left = 0\n",
"\n",
" # 把 top 按容差分桶,保证同一行内部按 left 排序更稳定\n",
" top_bucket = (top // y_tol_emu) * y_tol_emu\n",
" # 额外用 shape_id 稳定排序\n",
" sid = int(getattr(shape, \"shape_id\", 0))\n",
" return (top_bucket, left, sid)\n",
"\n",
"\n",
"def extract_pptx_in_order(pptx_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" prs = Presentation(str(pptx_path))\n",
" out_lines: List[str] = []\n",
" img_counter = 0\n",
"\n",
" for s_idx, slide in enumerate(prs.slides, 1):\n",
    "        out_lines.append(f\"=== 第 {s_idx} 页(Slide {s_idx}===\")\n",
"\n",
" shapes = list(slide.shapes)\n",
" shapes.sort(key=_shape_sort_key)\n",
"\n",
" for shape in shapes:\n",
" # 文本\n",
" if hasattr(shape, \"text\"):\n",
" t = (shape.text or \"\").strip()\n",
" if t:\n",
" out_lines.append(t)\n",
"\n",
" # 图片shape_type=13\n",
" if getattr(shape, \"shape_type\", None) == 13:\n",
" try:\n",
" img = shape.image\n",
" blob = img.blob\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
" ext = (img.ext or \"png\").lower()\n",
" if ext == \"jpeg\":\n",
" ext = \"jpg\"\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"slide{s_idx:02d}_img_{img_counter:04d}.{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" out_lines.append(f\"[图片 {img_counter:04d}](跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" text = ocr.ocr_bytes_text(norm, ocr_type=\"General\")\n",
" if text:\n",
" out_lines.append(f\"[图片 {img_counter:04d} 识别文本]\")\n",
" out_lines.append(text)\n",
" else:\n",
    "                    out_lines.append(f\"[图片 {img_counter:04d}](OCR 失败或未配置 AK/SK\")\n",
"\n",
" except Exception:\n",
" continue\n",
"\n",
" return \"\\n\".join(out_lines)\n",
"\n",
"\n",
"# =========================\n",
"# PDF按页面 block 坐标排序输出(文本/图片混排)+ 图片 OCR\n",
"# =========================\n",
"def _pdf_blocks_in_reading_order(page_dict: Dict[str, Any]) -> List[Dict[str, Any]]:\n",
" blocks = page_dict.get(\"blocks\", []) or []\n",
"\n",
" def key_fn(b):\n",
" bbox = b.get(\"bbox\") or [0, 0, 0, 0]\n",
" # y0, x0\n",
" return (round(bbox[1], 3), round(bbox[0], 3))\n",
"\n",
" return sorted(blocks, key=key_fn)\n",
"\n",
"\n",
"def _pdf_text_from_block(block: Dict[str, Any]) -> str:\n",
" lines = []\n",
" for line in block.get(\"lines\", []) or []:\n",
" spans = line.get(\"spans\", []) or []\n",
" # span text 拼接成行\n",
" txt = \"\".join((s.get(\"text\") or \"\") for s in spans).strip()\n",
" if txt:\n",
" lines.append(txt)\n",
" return \"\\n\".join(lines).strip()\n",
"\n",
"\n",
"def extract_pdf_in_order(pdf_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" doc = fitz.open(str(pdf_path))\n",
" out_lines: List[str] = []\n",
" img_counter = 0\n",
"\n",
" for p_idx in range(len(doc)):\n",
" page = doc[p_idx]\n",
    "        out_lines.append(f\"=== 第 {p_idx + 1} 页(Page {p_idx + 1}===\")\n",
"\n",
" d = page.get_text(\"dict\")\n",
" blocks = _pdf_blocks_in_reading_order(d)\n",
"\n",
" for b in blocks:\n",
" btype = b.get(\"type\")\n",
" if btype == 0:\n",
" t = _pdf_text_from_block(b)\n",
" if t:\n",
" out_lines.append(t)\n",
" elif btype == 1:\n",
" xref = b.get(\"xref\")\n",
" if not xref:\n",
" continue\n",
" try:\n",
" base = doc.extract_image(xref)\n",
" except Exception:\n",
" continue\n",
" if not base:\n",
" continue\n",
"\n",
" blob = base.get(\"image\")\n",
" ext = (base.get(\"ext\") or \"png\").lower()\n",
" if ext == \"jpeg\":\n",
" ext = \"jpg\"\n",
"\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"page{p_idx+1:03d}_img_{img_counter:04d}.{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" out_lines.append(f\"[图片 {img_counter:04d}](跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" text = ocr.ocr_bytes_text(norm, ocr_type=\"General\")\n",
" # print(text)\n",
" if text:\n",
" out_lines.append(f\"[图片 {img_counter:04d} 识别文本]\")\n",
" out_lines.append(text)\n",
" else:\n",
    "                out_lines.append(f\"[图片 {img_counter:04d}](OCR 失败或未配置 AK/SK\")\n",
"\n",
" doc.close()\n",
" return \"\\n\".join(out_lines)\n",
"\n",
"\n",
"# =========================\n",
"# 单文件主入口:按内容顺序输出到 txt\n",
"# =========================\n",
"def extract_single_file_to_txt(\n",
" file_path: str,\n",
" output_dir: str = \"output_single\",\n",
" save_images: bool = False,\n",
") -> str:\n",
" p = Path(file_path)\n",
" if not p.exists() or not p.is_file():\n",
" raise RuntimeError(f\"文件不存在:{file_path}\")\n",
"\n",
" ext = p.suffix.lower()\n",
" if ext not in (\".pdf\", \".docx\", \".pptx\"):\n",
    "        raise RuntimeError(f\"不支持的文件类型:{ext}(仅支持 pdf/docx/pptx)\")\n",
"\n",
" out_root = Path(output_dir)\n",
" ensure_dir(out_root)\n",
"\n",
" base = safe_name(p.stem)\n",
" out_txt = out_root / f\"{base}{ext}.txt\"\n",
" print(out_root)\n",
"\n",
" img_dir = None\n",
" if save_images:\n",
" img_dir = out_root / f\"{base}_images\"\n",
" ensure_dir(img_dir)\n",
"\n",
" ocr = AlibabaOCR()\n",
"\n",
" if ext == \".docx\":\n",
" content = extract_docx_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
" elif ext == \".pptx\":\n",
" content = extract_pptx_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
" else:\n",
" content = extract_pdf_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
"\n",
" out_txt.write_text(content or \"\", encoding=\"utf-8\")\n",
" return str(out_txt.resolve())\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "dda2a312-60c4-4521-a533-f2fde0b79c09",
"metadata": {},
"outputs": [],
"source": [
"# -*- coding: utf-8 -*-\n",
"import os\n",
"import re\n",
"import sys\n",
"import json\n",
"import ast\n",
"from typing import Union\n",
"from io import BytesIO\n",
"from pathlib import Path\n",
"from typing import List, Optional, Tuple, Dict, Any\n",
"\n",
"import fitz # PyMuPDF\n",
"from PIL import Image, ImageOps\n",
"\n",
"from docx import Document as docx_open\n",
"from docx.document import Document as DocxDocument\n",
"from docx.oxml.table import CT_Tbl\n",
"from docx.oxml.text.paragraph import CT_P\n",
"from docx.table import _Cell, Table\n",
"from docx.text.paragraph import Paragraph\n",
"\n",
"from pptx import Presentation\n",
"\n",
"# Alibaba OCR\n",
"from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client\n",
"from alibabacloud_tea_openapi import models as open_api_models\n",
"from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models\n",
"from alibabacloud_tea_util import models as util_models\n",
"\n",
"\n",
"# =========================\n",
"# 安全提醒:不要在代码里硬编码 AK/SK\n",
"# =========================\n",
"\n",
"\n",
"# =========================\n",
"# 通用工具\n",
"# =========================\n",
"def safe_name(name: str) -> str:\n",
" name = (name or \"\").strip()\n",
" name = re.sub(r\"[\\\\/:*?\\\"<>|]+\", \"_\", name)\n",
" return name[:150] if len(name) > 150 else name\n",
"\n",
"\n",
"def ensure_dir(p: Path):\n",
" p.mkdir(parents=True, exist_ok=True)\n",
"\n",
"\n",
"def guess_ext_from_content_type(content_type: str) -> str:\n",
" ct = (content_type or \"\").lower()\n",
" if \"jpeg\" in ct or \"jpg\" in ct:\n",
" return \".jpg\"\n",
" if \"png\" in ct:\n",
" return \".png\"\n",
" if \"bmp\" in ct:\n",
" return \".bmp\"\n",
" if \"tiff\" in ct or \"tif\" in ct:\n",
" return \".tif\"\n",
" if \"gif\" in ct:\n",
" return \".gif\"\n",
" return \".bin\"\n",
"\n",
"\n",
"# =========================\n",
"# PrettyWriter更排版友好的输出模板\n",
"# =========================\n",
"def _clean_text_block(text: str) -> str:\n",
" \"\"\"轻度清洗:去多余空行、行尾空格、行内连续空白压成单空格(不破坏换行)\"\"\"\n",
" if not text:\n",
" return \"\"\n",
" lines = [ln.rstrip() for ln in str(text).splitlines()]\n",
"\n",
" # 去首尾空行\n",
" while lines and not lines[0].strip():\n",
" lines.pop(0)\n",
" while lines and not lines[-1].strip():\n",
" lines.pop()\n",
"\n",
" out = []\n",
" for ln in lines:\n",
" # 压缩行内空白\n",
" ln2 = re.sub(r\"[ \\t]+\", \" \", ln).strip()\n",
" out.append(ln2)\n",
"\n",
" # 合并连续空行(最多保留 1 行空行)\n",
" merged = []\n",
" blank = False\n",
" for ln in out:\n",
" if not ln:\n",
" if not blank:\n",
" merged.append(\"\")\n",
" blank = True\n",
" else:\n",
" merged.append(ln)\n",
" blank = False\n",
"\n",
" return \"\\n\".join(merged).strip()\n",
"\n",
"\n",
"class PrettyWriter:\n",
" \"\"\"\n",
" 输出效果:\n",
" - 标题:标题 + 分隔线\n",
" - 段落:段落前后空一行\n",
" - 表格:按行输出,表格前后适当空行\n",
" - 图片 OCR图块插入且 OCR 只写 content\n",
" \"\"\"\n",
" def __init__(self):\n",
" self.lines: List[str] = []\n",
"\n",
" def _ensure_blank(self, n: int = 1):\n",
" # 末尾空行数\n",
" cnt = 0\n",
" i = len(self.lines) - 1\n",
" while i >= 0 and self.lines[i] == \"\":\n",
" cnt += 1\n",
" i -= 1\n",
" for _ in range(max(0, n - cnt)):\n",
" self.lines.append(\"\")\n",
"\n",
" def add_heading(self, title: str):\n",
" title = _clean_text_block(title)\n",
" if not title:\n",
" return\n",
" self._ensure_blank(1)\n",
" self.lines.append(title)\n",
" self.lines.append(\"-\" * max(10, min(80, len(title) * 2)))\n",
" self._ensure_blank(1)\n",
"\n",
" def add_paragraph(self, text: str):\n",
" text = _clean_text_block(text)\n",
" if not text:\n",
" return\n",
" self._ensure_blank(1)\n",
" self.lines.extend(text.splitlines())\n",
" self._ensure_blank(1)\n",
"\n",
" def add_table_rowline(self, text: str):\n",
" text = _clean_text_block(text)\n",
" if not text:\n",
" return\n",
" self.lines.append(text)\n",
"\n",
" def add_table_block(self, title: Optional[str], rows: List[str]):\n",
" rows = [r for r in rows if (r or \"\").strip()]\n",
" if not rows:\n",
" return\n",
" if title:\n",
" self.add_heading(title)\n",
" else:\n",
" self._ensure_blank(1)\n",
"\n",
" for r in rows:\n",
" self.add_table_rowline(r)\n",
" self._ensure_blank(1)\n",
"\n",
" def add_figure_content(self, fig_no: int, content: Optional[str], note: Optional[str] = None):\n",
" \"\"\"\n",
" 图片 OCR 图块:只输出 content\n",
" \"\"\"\n",
" self._ensure_blank(1)\n",
" header = f\"【图{fig_no:04d}】\"\n",
" if note:\n",
" note = _clean_text_block(note)\n",
" if note:\n",
" header += f\" {note}\"\n",
" self.lines.append(header)\n",
"\n",
" c = _clean_text_block(content or \"\")\n",
" if c:\n",
" self.lines.extend(c.splitlines())\n",
" else:\n",
    "            self.lines.append(\"OCR 失败(未配置 AK/SK 或 content 为空)\")\n",
"\n",
" self._ensure_blank(1)\n",
"\n",
" def to_text(self) -> str:\n",
" while self.lines and self.lines[-1] == \"\":\n",
" self.lines.pop()\n",
" return \"\\n\".join(self.lines) + \"\\n\"\n",
"\n",
"\n",
"# =========================\n",
"# OCR图片标准化 + Alibaba OCR只输出 content\n",
"# =========================\n",
"MIN_BYTES = 200\n",
"MAX_SIDE = 3000\n",
"MIN_FILE_SIZE = 5 * 1024 # 5KB\n",
"\n",
"\n",
"def ocr_data_to_content_only(data: Any) -> str:\n",
" \"\"\"\n",
" 只返回 OCR 结果里的 Content/content 字段(兼容大小写)。\n",
" 兼容Tea SDK 模型对象 / dict / JSON字符串 / Python dict字符串(str(dict))\n",
" \"\"\"\n",
" if data is None:\n",
" return \"\"\n",
"\n",
" # 1) Tea SDK Model优先用 to_map() 转 dict\n",
" # RecognizeAllTextResponseBodyData 通常有 to_map()\n",
" try:\n",
" if hasattr(data, \"to_map\") and callable(getattr(data, \"to_map\")):\n",
" data = data.to_map()\n",
" except Exception:\n",
" pass\n",
"\n",
" # 2) data 是字符串:可能是 JSON也可能是 \"{'Content': '...'}\"\n",
" if isinstance(data, str):\n",
" s = data.strip()\n",
" if not s:\n",
" return \"\"\n",
" if (s.startswith(\"{\") and s.endswith(\"}\")) or (s.startswith(\"[\") and s.endswith(\"]\")):\n",
" try:\n",
" data = json.loads(s)\n",
" except Exception:\n",
" try:\n",
" data = ast.literal_eval(s) # 安全:不执行代码\n",
" except Exception:\n",
" return \"\"\n",
" else:\n",
" return \"\"\n",
"\n",
" def pick_content(d: Dict[str, Any]) -> str:\n",
" for k in (\"Content\", \"content\"):\n",
" v = d.get(k)\n",
" if isinstance(v, str) and v.strip():\n",
" return v.strip()\n",
" return \"\"\n",
"\n",
" # 3) dict优先顶层 Content/content然后尝试常见嵌套\n",
" if isinstance(data, dict):\n",
" got = pick_content(data)\n",
" if got:\n",
" return got\n",
"\n",
" for nest_key in (\"Data\", \"data\", \"Result\", \"result\", \"Body\", \"body\"):\n",
" nest = data.get(nest_key)\n",
" # 嵌套也可能是 Tea Model\n",
" try:\n",
" if hasattr(nest, \"to_map\") and callable(getattr(nest, \"to_map\")):\n",
" nest = nest.to_map()\n",
" except Exception:\n",
" pass\n",
"\n",
" if isinstance(nest, dict):\n",
" got2 = pick_content(nest)\n",
" if got2:\n",
" return got2\n",
" return \"\"\n",
"\n",
" # 4) list找第一个 dict 里的 Content/content\n",
" if isinstance(data, list):\n",
" for it in data:\n",
" try:\n",
" if hasattr(it, \"to_map\") and callable(getattr(it, \"to_map\")):\n",
" it = it.to_map()\n",
" except Exception:\n",
" pass\n",
" if isinstance(it, dict):\n",
" got = pick_content(it)\n",
" if got:\n",
" return got\n",
" return \"\"\n",
"\n",
" return \"\"\n",
"\n",
"\n",
"class AlibabaOCR:\n",
" def __init__(self, endpoint: str = \"ocr-api.cn-hangzhou.aliyuncs.com\"):\n",
" self.endpoint = endpoint\n",
" self.client = self._try_create_client()\n",
"\n",
" def _try_create_client(self) -> Optional[ocr_api20210707Client]:\n",
" ak = os.environ.get(\"ALIBABA_CLOUD_ACCESS_KEY_ID\")\n",
" sk = os.environ.get(\"ALIBABA_CLOUD_ACCESS_KEY_SECRET\")\n",
" if not ak or not sk:\n",
" return None\n",
" config = open_api_models.Config(access_key_id=ak, access_key_secret=sk)\n",
" config.endpoint = self.endpoint\n",
" return ocr_api20210707Client(config)\n",
"\n",
" @staticmethod\n",
" def normalize_image_to_jpeg_bytes(image_bytes: bytes) -> Optional[bytes]:\n",
" \"\"\"\n",
" 统一成 OCR 友好的 JPEG bytes\n",
" - 自动纠正方向\n",
" - 转 RGB\n",
" - 过大就缩放\n",
" \"\"\"\n",
" try:\n",
" with Image.open(BytesIO(image_bytes)) as img:\n",
" img = ImageOps.exif_transpose(img)\n",
" if img.mode != \"RGB\":\n",
" img = img.convert(\"RGB\")\n",
"\n",
" w, h = img.size\n",
" m = max(w, h)\n",
" if m > MAX_SIDE:\n",
" scale = MAX_SIDE / float(m)\n",
" img = img.resize((int(w * scale), int(h * scale)))\n",
"\n",
" buf = BytesIO()\n",
" img.save(buf, format=\"JPEG\", quality=90, optimize=True)\n",
" data = buf.getvalue()\n",
" if len(data) < MIN_BYTES:\n",
" return None\n",
" return data\n",
" except Exception:\n",
" return None\n",
"\n",
" def ocr_bytes_content(self, image_jpeg_bytes: bytes, ocr_type: str = \"General\") -> Optional[str]:\n",
" \"\"\"\n",
" 只输出 content 字段。\n",
" \"\"\"\n",
" if not self.client:\n",
" return None\n",
" req = ocr_api_20210707_models.RecognizeAllTextRequest(body=image_jpeg_bytes, type=ocr_type)\n",
" runtime = util_models.RuntimeOptions()\n",
" try:\n",
" resp = self.client.recognize_all_text_with_options(req, runtime)\n",
" data = getattr(resp.body, \"data\", None)\n",
" # print(\"DEBUG type(data):\", type(data))\n",
" # print(\"DEBUG data keys:\", list(data.keys()) if isinstance(data, dict) else str(data)[:200])\n",
" \n",
" content = ocr_data_to_content_only(data)\n",
" content = (content or \"\").strip()\n",
" return content if content else None\n",
" except Exception:\n",
" return None\n",
"\n",
"\n",
"# =========================\n",
"# DOCX按内容顺序抽取 + 图片 OCR插入图块\n",
"# =========================\n",
"def _iter_docx_block_items(parent) -> List[Any]:\n",
" \"\"\"\n",
" 按 body 顺序返回 Paragraph / Table\n",
" parent 可以是 DocxDocument 或 _Cell\n",
" \"\"\"\n",
" if isinstance(parent, DocxDocument):\n",
" parent_elm = parent.element.body\n",
" else:\n",
" parent_elm = parent._tc # _Cell\n",
"\n",
" out = []\n",
" for child in parent_elm.iterchildren():\n",
" if isinstance(child, CT_P):\n",
" out.append(Paragraph(child, parent))\n",
" elif isinstance(child, CT_Tbl):\n",
" out.append(Table(child, parent))\n",
" return out\n",
"\n",
"\n",
"def _docx_paragraph_text(p: Paragraph) -> str:\n",
" return (p.text or \"\").strip()\n",
"\n",
"\n",
"def _docx_table_text(tbl: Table) -> List[str]:\n",
" lines = []\n",
" for row in tbl.rows:\n",
" row_text = []\n",
" for cell in row.cells:\n",
" cell_t = (cell.text or \"\").strip()\n",
" if cell_t:\n",
" row_text.append(cell_t)\n",
" if row_text:\n",
" lines.append(\"\\t\".join(row_text))\n",
" return lines\n",
"\n",
"\n",
"def _docx_find_image_rids_in_paragraph(p: Paragraph) -> List[str]:\n",
" \"\"\"\n",
" 不用 namespaces 参数,兼容 python-docx 的 xpath 实现。\n",
" \"\"\"\n",
" rids: List[str] = []\n",
" blips = p._p.xpath('.//*[local-name()=\"blip\"]')\n",
" for blip in blips:\n",
" rid = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') or \\\n",
" blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}link')\n",
" if rid:\n",
" rids.append(rid)\n",
" return rids\n",
"\n",
"\n",
"def extract_docx_in_order(docx_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" doc = docx_open(str(docx_path))\n",
" rels = doc.part.rels\n",
"\n",
" w = PrettyWriter()\n",
" w.add_heading(f\"文档:{docx_path.name}\")\n",
"\n",
" img_counter = 0\n",
" table_counter = 0\n",
"\n",
" def handle_paragraph(p: Paragraph):\n",
" nonlocal img_counter\n",
" t = _docx_paragraph_text(p)\n",
" if t:\n",
" w.add_paragraph(t)\n",
"\n",
" rids = _docx_find_image_rids_in_paragraph(p)\n",
" for rid in rids:\n",
" rel = rels.get(rid)\n",
" if not rel:\n",
" continue\n",
"\n",
" part = rel.target_part\n",
" blob = getattr(part, \"blob\", None)\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
" content_type = getattr(part, \"content_type\", \"\")\n",
" ext = guess_ext_from_content_type(content_type)\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"img_{img_counter:04d}{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" w.add_figure_content(img_counter, None, note=\"(跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" content = ocr.ocr_bytes_content(norm, ocr_type=\"General\")\n",
" w.add_figure_content(img_counter, content)\n",
"\n",
" def handle_table(tbl: Table):\n",
" nonlocal table_counter\n",
" table_counter += 1\n",
" rows = _docx_table_text(tbl)\n",
" # 更像最终稿:每个表格作为一个 block\n",
" w.add_table_block(f\"表格 {table_counter}\", rows)\n",
"\n",
" # 处理表格单元格内部的段落/图片(按单元格内部顺序)\n",
" for row in tbl.rows:\n",
" for cell in row.cells:\n",
" cell: _Cell\n",
" for item in _iter_docx_block_items(cell):\n",
" if isinstance(item, Paragraph):\n",
" handle_paragraph(item)\n",
" elif isinstance(item, Table):\n",
" handle_table(item)\n",
"\n",
" for item in _iter_docx_block_items(doc):\n",
" if isinstance(item, Paragraph):\n",
" handle_paragraph(item)\n",
" elif isinstance(item, Table):\n",
" handle_table(item)\n",
"\n",
" return w.to_text()\n",
"\n",
"\n",
"# =========================\n",
"# PPTX按坐标阅读顺序排序 shapes + 图片 OCR插入图块\n",
"# =========================\n",
"def _shape_sort_key(shape, y_tol_emu: int = 20000) -> Tuple[int, int, int]:\n",
" try:\n",
" top = int(getattr(shape, \"top\", 0))\n",
" except Exception:\n",
" top = 0\n",
" try:\n",
" left = int(getattr(shape, \"left\", 0))\n",
" except Exception:\n",
" left = 0\n",
" top_bucket = (top // y_tol_emu) * y_tol_emu\n",
" sid = int(getattr(shape, \"shape_id\", 0))\n",
" return (top_bucket, left, sid)\n",
"\n",
"\n",
"def extract_pptx_in_order(pptx_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" prs = Presentation(str(pptx_path))\n",
" w = PrettyWriter()\n",
" w.add_heading(f\"演示文稿:{pptx_path.name}\")\n",
"\n",
" img_counter = 0\n",
"\n",
" for s_idx, slide in enumerate(prs.slides, 1):\n",
    "        w.add_heading(f\"第 {s_idx} 页(Slide {s_idx}\")\n",
"\n",
" shapes = list(slide.shapes)\n",
" shapes.sort(key=_shape_sort_key)\n",
"\n",
" for shape in shapes:\n",
" # 文本\n",
" if hasattr(shape, \"text\"):\n",
" t = (shape.text or \"\").strip()\n",
" if t:\n",
" w.add_paragraph(t)\n",
"\n",
" # 图片shape_type=13\n",
" if getattr(shape, \"shape_type\", None) == 13:\n",
" try:\n",
" img = shape.image\n",
" blob = img.blob\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
" ext = (img.ext or \"png\").lower()\n",
" if ext == \"jpeg\":\n",
" ext = \"jpg\"\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"slide{s_idx:02d}_img_{img_counter:04d}.{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" w.add_figure_content(img_counter, None, note=\"(跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" content = ocr.ocr_bytes_content(norm, ocr_type=\"General\")\n",
" w.add_figure_content(img_counter, content)\n",
"\n",
" except Exception:\n",
" continue\n",
"\n",
" return w.to_text()\n",
"\n",
"\n",
"# =========================\n",
"# PDF按页面 block 坐标排序输出(文本/图片混排)+ 图片 OCR插入图块\n",
"# =========================\n",
"def _pdf_blocks_in_reading_order(page_dict: Dict[str, Any]) -> List[Dict[str, Any]]:\n",
" blocks = page_dict.get(\"blocks\", []) or []\n",
"\n",
" def key_fn(b):\n",
" bbox = b.get(\"bbox\") or [0, 0, 0, 0]\n",
" return (round(bbox[1], 3), round(bbox[0], 3)) # y0, x0\n",
"\n",
" return sorted(blocks, key=key_fn)\n",
"\n",
"\n",
"def _pdf_text_from_block(block: Dict[str, Any]) -> str:\n",
" lines = []\n",
" for line in block.get(\"lines\", []) or []:\n",
" spans = line.get(\"spans\", []) or []\n",
" txt = \"\".join((s.get(\"text\") or \"\") for s in spans).strip()\n",
" if txt:\n",
" lines.append(txt)\n",
" return \"\\n\".join(lines).strip()\n",
"\n",
"\n",
"def extract_pdf_in_order(pdf_path: Path, ocr: AlibabaOCR, save_images_dir: Optional[Path] = None) -> str:\n",
" doc = fitz.open(str(pdf_path))\n",
" w = PrettyWriter()\n",
" w.add_heading(f\"PDF{pdf_path.name}\")\n",
"\n",
" img_counter = 0\n",
"\n",
" for p_idx in range(len(doc)):\n",
" page = doc[p_idx]\n",
    "        w.add_heading(f\"第 {p_idx + 1} 页(Page {p_idx + 1}\")\n",
"\n",
" d = page.get_text(\"dict\")\n",
" blocks = _pdf_blocks_in_reading_order(d)\n",
"\n",
" for b in blocks:\n",
" btype = b.get(\"type\")\n",
" if btype == 0:\n",
" t = _pdf_text_from_block(b)\n",
" if t:\n",
" w.add_paragraph(t)\n",
"\n",
" elif btype == 1:\n",
" xref = b.get(\"xref\")\n",
" if not xref:\n",
" continue\n",
" try:\n",
" base = doc.extract_image(xref)\n",
" except Exception:\n",
" continue\n",
" if not base:\n",
" continue\n",
"\n",
" blob = base.get(\"image\")\n",
" ext = (base.get(\"ext\") or \"png\").lower()\n",
" if ext == \"jpeg\":\n",
" ext = \"jpg\"\n",
"\n",
" if not blob or len(blob) < MIN_FILE_SIZE:\n",
" continue\n",
"\n",
" img_counter += 1\n",
"\n",
" if save_images_dir:\n",
" ensure_dir(save_images_dir)\n",
" (save_images_dir / f\"page{p_idx+1:03d}_img_{img_counter:04d}.{ext}\").write_bytes(blob)\n",
"\n",
" norm = ocr.normalize_image_to_jpeg_bytes(blob)\n",
" if not norm or len(norm) < MIN_FILE_SIZE:\n",
" w.add_figure_content(img_counter, None, note=\"(跳过:规范化后太小/无效)\")\n",
" continue\n",
"\n",
" content = ocr.ocr_bytes_content(norm, ocr_type=\"General\")\n",
" w.add_figure_content(img_counter, content)\n",
"\n",
" doc.close()\n",
" return w.to_text()\n",
"\n",
"\n",
"# =========================\n",
"# 单文件主入口:按内容顺序输出到 txt\n",
"# =========================\n",
"def extract_single_file_to_txt(\n",
" file_path: str,\n",
" output_dir: str = \"output_single\",\n",
" save_images: bool = False,\n",
") -> str:\n",
" p = Path(file_path)\n",
" if not p.exists() or not p.is_file():\n",
" raise RuntimeError(f\"文件不存在:{file_path}\")\n",
"\n",
" ext = p.suffix.lower()\n",
" if ext not in (\".pdf\", \".docx\", \".pptx\"):\n",
    "        raise RuntimeError(f\"不支持的文件类型:{ext}(仅支持 pdf/docx/pptx)\")\n",
"\n",
" out_root = Path(output_dir)\n",
" ensure_dir(out_root)\n",
"\n",
" base = safe_name(p.stem)\n",
" out_txt = out_root / f\"{base}{ext}.txt\"\n",
"\n",
" img_dir = None\n",
" if save_images:\n",
" img_dir = out_root / f\"{base}_images\"\n",
" ensure_dir(img_dir)\n",
"\n",
" ocr = AlibabaOCR()\n",
"\n",
" if ext == \".docx\":\n",
" content = extract_docx_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
" elif ext == \".pptx\":\n",
" content = extract_pptx_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
" else:\n",
" content = extract_pdf_in_order(p, ocr=ocr, save_images_dir=img_dir)\n",
"\n",
" out_txt.write_text(content or \"\", encoding=\"utf-8\")\n",
" return str(out_txt.resolve())\n",
"\n",
"\n",
"\n",
"def extract_folder_to_txts(\n",
" input_dir: str,\n",
" output_dir: str = \"output_single\",\n",
" save_images: Union[bool, str] = False,\n",
") -> List[str]:\n",
" in_dir = Path(input_dir)\n",
" if not in_dir.exists() or not in_dir.is_dir():\n",
" raise RuntimeError(f\"输入目录不存在或不是文件夹:{input_dir}\")\n",
"\n",
" out_paths: List[str] = []\n",
" files = sorted([p for p in in_dir.rglob(\"*\") if p.is_file() and p.suffix.lower() in (\".docx\", \".pdf\", \".pptx\")])\n",
"\n",
" for fp in files:\n",
" try:\n",
" out_txt = extract_single_file_to_txt(str(fp), output_dir=output_dir, save_images=save_images)\n",
" out_paths.append(out_txt)\n",
" print(f\"[OK] {fp} -> {out_txt}\")\n",
" except Exception as e:\n",
" print(f\"[FAIL] {fp} -> {e}\")\n",
"\n",
" return out_paths\n",
"\n",
"\n",
"# if __name__ == \"__main__\":\n",
"# if len(sys.argv) < 2:\n",
"# print(\"用法python extract_one.py <file_path> [output_dir] [save_images(0/1)]\")\n",
"# print(\"示例python extract_one.py ./a.pdf ./out 1\")\n",
"# sys.exit(1)\n",
"\n",
"# file_path = sys.argv[1]\n",
"# output_dir = sys.argv[2] if len(sys.argv) >= 3 else \"output_single\"\n",
"# save_images = bool(int(sys.argv[3])) if len(sys.argv) >= 4 else False\n",
"\n",
"# out_txt = extract_single_file_to_txt(file_path, output_dir=output_dir, save_images=save_images)\n",
"# print(f\"完成。输出文件:{out_txt}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "b529882f-38ab-4cce-b92c-df5c1f6071fd",
"metadata": {},
"outputs": [],
"source": [
"file_path = \"/data/docs/c0e7523d3b5e27c903eb8748a475520c.docx\" \n",
"output_dir = \"/data/output/\"\n",
"save_images = \"/data/output/image\""
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "357a90f9-adb1-4d41-938b-66515d1a1336",
"metadata": {},
"outputs": [],
"source": [
    "# SECURITY: live AK/SK pairs were hard-coded in this cell and must be treated\n",
    "# as leaked — rotate both key pairs in the Alibaba Cloud console, then supply\n",
    "# credentials via the environment (shell export / .env) before starting Jupyter.\n",
    "for _k in (\"ALIBABA_CLOUD_ACCESS_KEY_ID\", \"ALIBABA_CLOUD_ACCESS_KEY_SECRET\"):\n",
    "    if not os.environ.get(_k):\n",
    "        raise RuntimeError(f\"missing credential env var: {_k}\")"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "8014a00f-4c96-47f3-9793-79728f4e4b9d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[OK] /data/投喂文献/核心文献/《国际近视研究院近视防控白皮书Ⅲ》解读.pdf -> /data/result/核心文献/《国际近视研究院近视防控白皮书Ⅲ》解读.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/《近视防治指南2024年版》专家解读.pdf -> /data/result/核心文献/《近视防治指南2024年版》专家解读.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国儿童弱视防治专家共识2021年.pdf -> /data/result/核心文献/中国儿童弱视防治专家共识2021年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国儿童睫状肌麻痹验光及安全用药专家共识2019年.pdf -> /data/result/核心文献/中国儿童睫状肌麻痹验光及安全用药专家共识2019年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国儿童青少年近视防控公共卫生综合干预行动专家共识.pdf -> /data/result/核心文献/中国儿童青少年近视防控公共卫生综合干预行动专家共识.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国多点微透镜近视离焦设计镜片框架眼镜近视控制效果评价和安全配戴专家共识2025年.pdf -> /data/result/核心文献/中国多点微透镜近视离焦设计镜片框架眼镜近视控制效果评价和安全配戴专家共识2025年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国婴幼儿视力评估专家共识2023年.pdf -> /data/result/核心文献/中国婴幼儿视力评估专家共识2023年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国学龄儿童眼球远视储备、眼轴长度、角膜曲率参考区间及相关遗传因素专家共识2022年.pdf -> /data/result/核心文献/中国学龄儿童眼球远视储备、眼轴长度、角膜曲率参考区间及相关遗传因素专家共识2022年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国接触镜相关性干眼诊疗专家共识2024年.pdf -> /data/result/核心文献/中国接触镜相关性干眼诊疗专家共识2024年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国激光角膜屈光手术术前验光及单眼视模拟专家共识(2023).pdf -> /data/result/核心文献/中国激光角膜屈光手术术前验光及单眼视模拟专家共识(2023).pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国视疲劳诊疗专家共识2024年.pdf -> /data/result/核心文献/中国视疲劳诊疗专家共识2024年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中国近视前期管理专家共识2025年.pdf -> /data/result/核心文献/中国近视前期管理专家共识2025年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/中学生近视眼预防的干预效果评价.pdf -> /data/result/核心文献/中学生近视眼预防的干预效果评价.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/亚洲近视管理共识.pdf -> /data/result/核心文献/亚洲近视管理共识.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/低浓度阿托品滴眼液在儿童青少年近视防控中的应用专家共识2024.pdf -> /data/result/核心文献/低浓度阿托品滴眼液在儿童青少年近视防控中的应用专家共识2024.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/体检人群眼健康筛查及健康管理专家共识2024.pdf -> /data/result/核心文献/体检人群眼健康筛查及健康管理专家共识2024.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/儿童屈光矫正专家共识2017.pdf -> /data/result/核心文献/儿童屈光矫正专家共识2017.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/儿童青少年近视中西医结合诊疗指南.pdf -> /data/result/核心文献/儿童青少年近视中西医结合诊疗指南.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/儿童青少年近视普查中检测设备和设置标准化专家共识2019.pdf -> /data/result/核心文献/儿童青少年近视普查中检测设备和设置标准化专家共识2019.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/儿童青少年近视普查信息化管理专家共识2019.pdf -> /data/result/核心文献/儿童青少年近视普查信息化管理专家共识2019.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/儿童青少年近视普查工作流程专家共识2019.pdf -> /data/result/核心文献/儿童青少年近视普查工作流程专家共识2019.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/儿童青少年近视防控户外活动中的强光防护专家意见.pdf -> /data/result/核心文献/儿童青少年近视防控户外活动中的强光防护专家意见.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/关于加强儿童青少年近视防控用眼行为干预的倡议及实施方法共识(2023).pdf -> /data/result/核心文献/关于加强儿童青少年近视防控用眼行为干预的倡议及实施方法共识(2023).pdf.txt\n",
"[OK] /data/投喂文献/核心文献/周边眼轴在近视防控中的应用进展.pdf -> /data/result/核心文献/周边眼轴在近视防控中的应用进展.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/国际近视研究院_近视发生与进展的防控干预措施(2025).pdf -> /data/result/核心文献/国际近视研究院_近视发生与进展的防控干预措施(2025).pdf.txt\n",
"[OK] /data/投喂文献/核心文献/国际近视研究院———近视管理仪器(2025).pdf -> /data/result/核心文献/国际近视研究院———近视管理仪器(2025).pdf.txt\n",
"[OK] /data/投喂文献/核心文献/基于Gabor视标的视知觉学习治疗难治性弱视的临床疗效观察.pdf -> /data/result/核心文献/基于Gabor视标的视知觉学习治疗难治性弱视的临床疗效观察.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/基于智能化设备的知觉学习训练在周边视野缺损的视觉康复中的应用.pdf -> /data/result/核心文献/基于智能化设备的知觉学习训练在周边视野缺损的视觉康复中的应用.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/基于虚拟现实技术的近视防控策略与效果.pdf -> /data/result/核心文献/基于虚拟现实技术的近视防控策略与效果.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/多元视力训练对初中生视力影响的实证研究.pdf -> /data/result/核心文献/多元视力训练对初中生视力影响的实证研究.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/客观评价眼轴长度的概念和测量方法.pdf -> /data/result/核心文献/客观评价眼轴长度的概念和测量方法.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/对比敏感度检测方法及其在弱视诊疗中的应用.pdf -> /data/result/核心文献/对比敏感度检测方法及其在弱视诊疗中的应用.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/小学生近视筛查常用方法的应用价值评价.pdf -> /data/result/核心文献/小学生近视筛查常用方法的应用价值评价.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/屈光参差对对比敏感度的影响.pdf -> /data/result/核心文献/屈光参差对对比敏感度的影响.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/年龄相关视功能和眼健康管理白皮书.pdf -> /data/result/核心文献/年龄相关视功能和眼健康管理白皮书.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/应用于近视控制的多焦软镜验配专家共识2023.pdf -> /data/result/核心文献/应用于近视控制的多焦软镜验配专家共识2023.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/弱视治疗进展 .pdf -> /data/result/核心文献/弱视治疗进展.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/弱视的定义、分类及疗效评价标准.pdf -> /data/result/核心文献/弱视的定义、分类及疗效评价标准.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/弱视眼的对比敏感度.pdf -> /data/result/核心文献/弱视眼的对比敏感度.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/快速对比敏感度函数的双眼总和评估弱视阈值水平双眼功能的效果.pdf -> /data/result/核心文献/快速对比敏感度函数的双眼总和评估弱视阈值水平双眼功能的效果.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/我国斜视分类专家共识2015年.pdf -> /data/result/核心文献/我国斜视分类专家共识2015年.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/斜视的分类.pdf -> /data/result/核心文献/斜视的分类.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/斜视相关术语的英文缩写规范.pdf -> /data/result/核心文献/斜视相关术语的英文缩写规范.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/理视健眼操及其解剖学基础.pdf -> /data/result/核心文献/理视健眼操及其解剖学基础.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/眼轴检测在儿童青少年近视防控中的价值.pdf -> /data/result/核心文献/眼轴检测在儿童青少年近视防控中的价值.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/眼轴的长度与近视.pdf -> /data/result/核心文献/眼轴的长度与近视.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/眼轴长度在近视防控管理中的应用专家共识(2023).pdf -> /data/result/核心文献/眼轴长度在近视防控管理中的应用专家共识(2023).pdf.txt\n",
"[OK] /data/投喂文献/核心文献/知觉学习改善屈光参差性弱视视功能的临床观察.pdf -> /data/result/核心文献/知觉学习改善屈光参差性弱视视功能的临床观察.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/知觉学习训练在弱视治疗中的应用.pdf -> /data/result/核心文献/知觉学习训练在弱视治疗中的应用.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/知觉学习训练重建共同性外斜视术后立体视功能的作用.pdf -> /data/result/核心文献/知觉学习训练重建共同性外斜视术后立体视功能的作用.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/自适应光学矫正高阶像差结合视知觉学习对难治性弱视的治疗效果.pdf -> /data/result/核心文献/自适应光学矫正高阶像差结合视知觉学习对难治性弱视的治疗效果.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/规范斜视的诊断和治疗解读美国眼科学会内斜视和外斜视2017年版临床指南.pdf -> /data/result/核心文献/规范斜视的诊断和治疗解读美国眼科学会内斜视和外斜视2017年版临床指南.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视力健康与《国家学生体质健康标准》项目指标相关性研究.pdf -> /data/result/核心文献/视力健康与《国家学生体质健康标准》项目指标相关性研究.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉优势在眼科临床应用中的研究进展.pdf -> /data/result/核心文献/视知觉优势在眼科临床应用中的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习与注意力的关系.pdf -> /data/result/核心文献/视知觉学习与注意力的关系.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习对早期老视症状的改善作用.pdf -> /data/result/核心文献/视知觉学习对早期老视症状的改善作用.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习治疗弱视患儿依从性及其影响因素的调查分析.pdf -> /data/result/核心文献/视知觉学习治疗弱视患儿依从性及其影响因素的调查分析.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习治疗近视性弱视屈光度的观察及护理.pdf -> /data/result/核心文献/视知觉学习治疗近视性弱视屈光度的观察及护理.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习疗法对先天性青光眼患儿术后眼视力恢复的效果.pdf -> /data/result/核心文献/视知觉学习疗法对先天性青光眼患儿术后眼视力恢复的效果.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习疗法治疗学龄前儿童屈光不正性弱视的效果.pdf -> /data/result/核心文献/视知觉学习疗法治疗学龄前儿童屈光不正性弱视的效果.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习的发展与运用.pdf -> /data/result/核心文献/视知觉学习的发展与运用.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视知觉学习的时间进程及其与睡眠的关系.pdf -> /data/result/核心文献/视知觉学习的时间进程及其与睡眠的关系.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视网膜对比度信号对近视控制作用的研究进展.pdf -> /data/result/核心文献/视网膜对比度信号对近视控制作用的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视觉皮质可塑性及双眼视觉与弱视治疗新理念.pdf -> /data/result/核心文献/视觉皮质可塑性及双眼视觉与弱视治疗新理念.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/视觉训练治疗近视伴调节不足的临床效果探讨.pdf -> /data/result/核心文献/视觉训练治疗近视伴调节不足的临床效果探讨.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/角膜塑形镜和脑视觉神经可塑在减缓近视进展中的协同作用研究.pdf -> /data/result/核心文献/角膜塑形镜和脑视觉神经可塑在减缓近视进展中的协同作用研究.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/轴率比诊断学龄期儿童近视的准确性评估.pdf -> /data/result/核心文献/轴率比诊断学龄期儿童近视的准确性评估.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/近视病因与发病机制的研究进展.pdf -> /data/result/核心文献/近视病因与发病机制的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/近视的脑视觉研究现状及展望.pdf -> /data/result/核心文献/近视的脑视觉研究现状及展望.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/近视眼的发生机制.pdf -> /data/result/核心文献/近视眼的发生机制.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/近视矫正、近视控制与近视管理_术语定义与推荐使用方式.pdf -> /data/result/核心文献/近视矫正、近视控制与近视管理_术语定义与推荐使用方式.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/近视管理白皮书2019.pdf -> /data/result/核心文献/近视管理白皮书2019.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/近视管理白皮书2025.pdf -> /data/result/核心文献/近视管理白皮书2025.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/近视防控相关框架眼镜在近视管理中的应用专家共识2023.pdf -> /data/result/核心文献/近视防控相关框架眼镜在近视管理中的应用专家共识2023.pdf.txt\n",
"[OK] /data/投喂文献/核心文献/重视高度近视防控的专家共识2017.pdf -> /data/result/核心文献/重视高度近视防控的专家共识2017.pdf.txt\n",
"生成txt数量 75\n",
"[OK] /data/投喂文献/科普/不同健康教育方式对儿童青少年近视防控作用的研究进展.pdf -> /data/result/科普/不同健康教育方式对儿童青少年近视防控作用的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/科普/中小学校医与保健教师近视防控核心胜任力评价量表的构建研究.pdf -> /data/result/科普/中小学校医与保健教师近视防控核心胜任力评价量表的构建研究.pdf.txt\n",
"[OK] /data/投喂文献/科普/互联网竟然能为近视防控做这么多.pdf -> /data/result/科普/互联网竟然能为近视防控做这么多.pdf.txt\n",
"[OK] /data/投喂文献/科普/从生活习惯到科技辅助综合防控近视.pdf -> /data/result/科普/从生活习惯到科技辅助综合防控近视.pdf.txt\n",
"[OK] /data/投喂文献/科普/以团队为基础学习的翻转课堂在青少年近视防控中的应用效果.pdf -> /data/result/科普/以团队为基础学习的翻转课堂在青少年近视防控中的应用效果.pdf.txt\n",
"[OK] /data/投喂文献/科普/例谈如何利用体育活动加强小学生近视防控.pdf -> /data/result/科普/例谈如何利用体育活动加强小学生近视防控.pdf.txt\n",
"[OK] /data/投喂文献/科普/做好儿童近视防控,拒当_小眼镜_.pdf -> /data/result/科普/做好儿童近视防控,拒当_小眼镜_.pdf.txt\n",
"[OK] /data/投喂文献/科普/儿童青少年身体活动与近视的关系_系统综述和Meta分析.pdf -> /data/result/科普/儿童青少年身体活动与近视的关系_系统综述和Meta分析.pdf.txt\n",
"[OK] /data/投喂文献/科普/儿童青少年近视防控健康宣教工作的探讨.pdf -> /data/result/科普/儿童青少年近视防控健康宣教工作的探讨.pdf.txt\n",
"[OK] /data/投喂文献/科普/基于天人同律从_脑目同调_探讨儿童青少年近视防控.pdf -> /data/result/科普/基于天人同律从_脑目同调_探讨儿童青少年近视防控.pdf.txt\n",
"[OK] /data/投喂文献/科普/多方合作做好幼儿近视防控工作.pdf -> /data/result/科普/多方合作做好幼儿近视防控工作.pdf.txt\n",
"[OK] /data/投喂文献/科普/如何做好儿童青少年近视防控.pdf -> /data/result/科普/如何做好儿童青少年近视防控.pdf.txt\n",
"[OK] /data/投喂文献/科普/学生近视防控相关知识态度行为现状.pdf -> /data/result/科普/学生近视防控相关知识态度行为现状.pdf.txt\n",
"[OK] /data/投喂文献/科普/学生近视防控知识知多少.pdf -> /data/result/科普/学生近视防控知识知多少.pdf.txt\n",
"[OK] /data/投喂文献/科普/学生近视,真的和座位有关吗?.pdf -> /data/result/科普/学生近视,真的和座位有关吗?.pdf.txt\n",
"[OK] /data/投喂文献/科普/守住幼儿园预防近视的_第一道关口_.pdf -> /data/result/科普/守住幼儿园预防近视的_第一道关口_.pdf.txt\n",
"[OK] /data/投喂文献/科普/守护青少年的视力健康.pdf -> /data/result/科普/守护青少年的视力健康.pdf.txt\n",
"[OK] /data/投喂文献/科普/小学体育课中近视防控的睫状肌训练频次研究.pdf -> /data/result/科普/小学体育课中近视防控的睫状肌训练频次研究.pdf.txt\n",
"[OK] /data/投喂文献/科普/小学生近视防控系列校本课程的开发路径.pdf -> /data/result/科普/小学生近视防控系列校本课程的开发路径.pdf.txt\n",
"[OK] /data/投喂文献/科普/小学语文阅读中渗透近视防控的有效方法.pdf -> /data/result/科普/小学语文阅读中渗透近视防控的有效方法.pdf.txt\n",
"[OK] /data/投喂文献/科普/排球运动在高中生近视防控中的应用研究.pdf -> /data/result/科普/排球运动在高中生近视防控中的应用研究.pdf.txt\n",
"[OK] /data/投喂文献/科普/暑期别忘记近视防控.pdf -> /data/result/科普/暑期别忘记近视防控.pdf.txt\n",
"[OK] /data/投喂文献/科普/校医主导的中学生近视防控分层干预策略及实施效果研究.pdf -> /data/result/科普/校医主导的中学生近视防控分层干预策略及实施效果研究.pdf.txt\n",
"[OK] /data/投喂文献/科普/核心素养视域下中小学生运动技能学习与近视防控.pdf -> /data/result/科普/核心素养视域下中小学生运动技能学习与近视防控.pdf.txt\n",
"[OK] /data/投喂文献/科普/爱眼护眼宣传活动与青少年近视防控.pdf -> /data/result/科普/爱眼护眼宣传活动与青少年近视防控.pdf.txt\n",
"[OK] /data/投喂文献/科普/精细化眼科护理模式在小学生近视防控中的应用与效果分析.pdf -> /data/result/科普/精细化眼科护理模式在小学生近视防控中的应用与效果分析.pdf.txt\n",
"[OK] /data/投喂文献/科普/花样踢毽子教学训练对青少年近视防控的影响研究.pdf -> /data/result/科普/花样踢毽子教学训练对青少年近视防控的影响研究.pdf.txt\n",
"[OK] /data/投喂文献/科普/营养干预对青少年近视防控效果的询证研究.pdf -> /data/result/科普/营养干预对青少年近视防控效果的询证研究.pdf.txt\n",
"[OK] /data/投喂文献/科普/视动协调训练在小学生近视防控中的应用.pdf -> /data/result/科普/视动协调训练在小学生近视防控中的应用.pdf.txt\n",
"[OK] /data/投喂文献/科普/调整课桌高度防近视学校别忽视这件_小事_.pdf -> /data/result/科普/调整课桌高度防近视学校别忽视这件_小事_.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的作用.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的作用.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的作用及其可行性分析.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的作用及其可行性分析.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的作用及其可行性探讨.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的作用及其可行性探讨.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的作用及其可行性探讨1.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的作用及其可行性探讨1.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的作用及其可行性探讨2.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的作用及其可行性探讨2.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的作用及可行性分析.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的作用及可行性分析.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的作用及探讨.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的作用及探讨.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视健康教育在青少年近视防控中的应用效果.pdf -> /data/result/科普/近视健康教育在青少年近视防控中的应用效果.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视加重,危害有哪些.pdf -> /data/result/科普/近视加重,危害有哪些.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视防控,这些知识你应该知道.pdf -> /data/result/科普/近视防控,这些知识你应该知道.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视防控基地要切实守护青少年_心灵之窗_.pdf -> /data/result/科普/近视防控基地要切实守护青少年_心灵之窗_.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视防控守护光明.pdf -> /data/result/科普/近视防控守护光明.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视防控怎么做.pdf -> /data/result/科普/近视防控怎么做.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视防控知多少.pdf -> /data/result/科普/近视防控知多少.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视防控知识家长早知道.pdf -> /data/result/科普/近视防控知识家长早知道.pdf.txt\n",
"[OK] /data/投喂文献/科普/近视高发背景下,如何科学用眼.pdf -> /data/result/科普/近视高发背景下,如何科学用眼.pdf.txt\n",
"[OK] /data/投喂文献/科普/重视6岁以下儿童的近视防控.pdf -> /data/result/科普/重视6岁以下儿童的近视防控.pdf.txt\n",
"[OK] /data/投喂文献/科普/针灸对儿童低中度近视防控的临床观察.pdf -> /data/result/科普/针灸对儿童低中度近视防控的临床观察.pdf.txt\n",
"[OK] /data/投喂文献/科普/闲话成语,谈谈青少年近视防控问题.pdf -> /data/result/科普/闲话成语,谈谈青少年近视防控问题.pdf.txt\n",
"[OK] /data/投喂文献/科普/间歇远眺法联合传统眼保健操防控儿童近视的疗效观察.pdf -> /data/result/科普/间歇远眺法联合传统眼保健操防控儿童近视的疗效观察.pdf.txt\n",
"[OK] /data/投喂文献/科普/青少年儿童近视防控早知道.pdf -> /data/result/科普/青少年儿童近视防控早知道.pdf.txt\n",
"[OK] /data/投喂文献/科普/青少年近视与身体姿态异常的关系研究.pdf -> /data/result/科普/青少年近视与身体姿态异常的关系研究.pdf.txt\n",
"[OK] /data/投喂文献/科普/青少年近视防控中用近视健康教育在眼镜度数、健康行为改变效果观察.pdf -> /data/result/科普/青少年近视防控中用近视健康教育在眼镜度数、健康行为改变效果观察.pdf.txt\n",
"[OK] /data/投喂文献/科普/青少年近视防控科普知识.pdf -> /data/result/科普/青少年近视防控科普知识.pdf.txt\n",
"生成txt数量 54\n",
"[OK] /data/投喂文献/基础研究/815岁近视性屈光参差者调节及双眼视功能的相关研究.pdf -> /data/result/基础研究/815岁近视性屈光参差者调节及双眼视功能的相关研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/Bruch膜结构功能及其与近视的关系.pdf -> /data/result/基础研究/Bruch膜结构功能及其与近视的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/Cx36在视网膜信号传导及近视进展中的作用研究进展.pdf -> /data/result/基础研究/Cx36在视网膜信号传导及近视进展中的作用研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/I型胶原基因与近视关系的研究进展.pdf -> /data/result/基础研究/I型胶原基因与近视关系的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/ON-OFF视觉通路在近视中的作用及其可能机制研究进展.pdf -> /data/result/基础研究/ON-OFF视觉通路在近视中的作用及其可能机制研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/Wnt_βcatenin信号通路与近视发病机制的关系.pdf -> /data/result/基础研究/Wnt_βcatenin信号通路与近视发病机制的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/_3D视觉训练_新科技应用前景.pdf -> /data/result/基础研究/_3D视觉训练_新科技应用前景.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/一种预防近视的坐姿调整装置.pdf -> /data/result/基础研究/一种预防近视的坐姿调整装置.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/三级视功能训练教程在弱视治疗的应用.pdf -> /data/result/基础研究/三级视功能训练教程在弱视治疗的应用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/不同光学镜片设计对近视进展控制效果的影响研究.pdf -> /data/result/基础研究/不同光学镜片设计对近视进展控制效果的影响研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/不同矫正方式对近视患者双眼视功能的影响.pdf -> /data/result/基础研究/不同矫正方式对近视患者双眼视功能的影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/不同矫正状态对儿童青少年近视进展的影响.pdf -> /data/result/基础研究/不同矫正状态对儿童青少年近视进展的影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/不同阿托品用药方案联合视功能训练对近视患者的疗效及对其眼调节灵敏度的影响.pdf -> /data/result/基础研究/不同阿托品用药方案联合视功能训练对近视患者的疗效及对其眼调节灵敏度的影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/不同验光方法在青少年近视眼防治中的临床意义.pdf -> /data/result/基础研究/不同验光方法在青少年近视眼防治中的临床意义.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/与青少年近视相关的环境因素分析.pdf -> /data/result/基础研究/与青少年近视相关的环境因素分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/以档案信息化赋能儿童青少年视力健康管理.pdf -> /data/result/基础研究/以档案信息化赋能儿童青少年视力健康管理.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/低度近视合并超高度散光验配OK镜控制近视1例.pdf -> /data/result/基础研究/低度近视合并超高度散光验配OK镜控制近视1例.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/低浓度阿托品在儿童近视眼防控中应用的研究进展.pdf -> /data/result/基础研究/低浓度阿托品在儿童近视眼防控中应用的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/低浓度阿托品治疗儿童青少年近视的应用及机制研究.pdf -> /data/result/基础研究/低浓度阿托品治疗儿童青少年近视的应用及机制研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/儿童和青少年近视控制方法的研究进展.pdf -> /data/result/基础研究/儿童和青少年近视控制方法的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/儿童至成年期屈光不正的发生机制与干预策略的研究进展.pdf -> /data/result/基础研究/儿童至成年期屈光不正的发生机制与干预策略的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/儿童轴性近视与屈光性近视进展比较一项3年队列研究.pdf -> /data/result/基础研究/儿童轴性近视与屈光性近视进展比较一项3年队列研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/儿童近视戴镜后复诊依从性对近视度数增加影响.pdf -> /data/result/基础研究/儿童近视戴镜后复诊依从性对近视度数增加影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/儿童青少年近视防控策略研究进展.pdf -> /data/result/基础研究/儿童青少年近视防控策略研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/光学干预对儿童近视防控的研究进展.pdf -> /data/result/基础研究/光学干预对儿童近视防控的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/光学离焦矫正方法的研究现状.pdf -> /data/result/基础研究/光学离焦矫正方法的研究现状.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/光线昼夜节律紊乱与近视的关系.pdf -> /data/result/基础研究/光线昼夜节律紊乱与近视的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/兒童近視防控效果的臨床比較分析.pdf -> /data/result/基础研究/兒童近視防控效果的臨床比較分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/关于青少年近视影响因素及防控措施综述.pdf -> /data/result/基础研究/关于青少年近视影响因素及防控措施综述.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/分子水平近视发生机制的研究.pdf -> /data/result/基础研究/分子水平近视发生机制的研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/分析视觉训练对青少年近视防控的临床效果.pdf -> /data/result/基础研究/分析视觉训练对青少年近视防控的临床效果.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/单眼近视屈光参差儿童视网膜周边离焦和眼波前像差的临床分析.pdf -> /data/result/基础研究/单眼近视屈光参差儿童视网膜周边离焦和眼波前像差的临床分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/后巩膜加固术在高度近视治疗中的应用.pdf -> /data/result/基础研究/后巩膜加固术在高度近视治疗中的应用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/周边屈光度与近视的研究进展.pdf -> /data/result/基础研究/周边屈光度与近视的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/周边离焦功能性眼镜与单焦点眼镜对儿童近视防控效果.pdf -> /data/result/基础研究/周边离焦功能性眼镜与单焦点眼镜对儿童近视防控效果.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/周边离焦学说在近视防控中的研究进展.pdf -> /data/result/基础研究/周边离焦学说在近视防控中的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/周边视网膜相对屈光度与近视进展的关系.pdf -> /data/result/基础研究/周边视网膜相对屈光度与近视进展的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/周边视网膜离焦机制在近视进展中的作用.pdf -> /data/result/基础研究/周边视网膜离焦机制在近视进展中的作用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/国际近视研究院(IMI) ———近视遗传学报告.pdf -> /data/result/基础研究/国际近视研究院(IMI) ———近视遗传学报告.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/国际近视研究院———2025摘要.pdf -> /data/result/基础研究/国际近视研究院———2025摘要.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/国际近视研究院关于婴幼儿高度近视管理与调查的报告.pdf -> /data/result/基础研究/国际近视研究院关于婴幼儿高度近视管理与调查的报告.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/基于家庭层面的儿童青少年近视防控影响因素定性研究.pdf -> /data/result/基础研究/基于家庭层面的儿童青少年近视防控影响因素定性研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/多区正向离焦镜片在近视防控中的应用进展.pdf -> /data/result/基础研究/多区正向离焦镜片在近视防控中的应用进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/多区正向离焦镜片延缓近视进展疗效评价的meta分析.pdf -> /data/result/基础研究/多区正向离焦镜片延缓近视进展疗效评价的meta分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/多区正向离焦镜片的真实世界近视防控效果.pdf -> /data/result/基础研究/多区正向离焦镜片的真实世界近视防控效果.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/多区正向离焦镜片联合 0.01%阿托品滴眼液在儿童近视防控中的应用.pdf -> /data/result/基础研究/多区正向离焦镜片联合 0.01%阿托品滴眼液在儿童近视防控中的应用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/多巴胺在近视中的作用.pdf -> /data/result/基础研究/多巴胺在近视中的作用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/多巴胺在近视形成中作用的研究进展.pdf -> /data/result/基础研究/多巴胺在近视形成中作用的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/天津市6~15岁中小学生睡眠与近视的关系.pdf -> /data/result/基础研究/天津市6~15岁中小学生睡眠与近视的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/学龄儿童握笔姿势及握力与近视的相关性研究.pdf -> /data/result/基础研究/学龄儿童握笔姿势及握力与近视的相关性研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/学龄儿童近视的预防与控制.pdf -> /data/result/基础研究/学龄儿童近视的预防与控制.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/实验性近视的药物治疗机制.pdf -> /data/result/基础研究/实验性近视的药物治疗机制.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/家庭环境和行为因素与学龄儿童不同程度近视的关联作用分析.pdf -> /data/result/基础研究/家庭环境和行为因素与学龄儿童不同程度近视的关联作用分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/家庭采光照明环境和用眼行为与儿童近视的病例对照研究.pdf -> /data/result/基础研究/家庭采光照明环境和用眼行为与儿童近视的病例对照研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/小鼠近视模型的影响因素及研究进展.pdf -> /data/result/基础研究/小鼠近视模型的影响因素及研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/巩膜成纤维细胞在近视发病机制中的作用.pdf -> /data/result/基础研究/巩膜成纤维细胞在近视发病机制中的作用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/常见近视动物模型及诱导方法.pdf -> /data/result/基础研究/常见近视动物模型及诱导方法.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/微结构离焦框架眼镜的光学区直径对近视控制效果的影响.pdf -> /data/result/基础研究/微结构离焦框架眼镜的光学区直径对近视控制效果的影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/户外活动与近视的关系.pdf -> /data/result/基础研究/户外活动与近视的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/户外活动在近视防控中的作用.pdf -> /data/result/基础研究/户外活动在近视防控中的作用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/户外活动对学生近视的影响及机制的研究分析.pdf -> /data/result/基础研究/户外活动对学生近视的影响及机制的研究分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/探究在学龄期青少年进展过快近视防控中予以 0.01% 阿托品滴眼液的应用价值.pdf -> /data/result/基础研究/探究在学龄期青少年进展过快近视防控中予以 0.01% 阿托品滴眼液的应用价值.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/探讨青少年儿童近视防控中采用角膜塑形镜与多点近视离焦镜片的对比效果.pdf -> /data/result/基础研究/探讨青少年儿童近视防控中采用角膜塑形镜与多点近视离焦镜片的对比效果.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/教室环境与学生近视的关联研究.pdf -> /data/result/基础研究/教室环境与学生近视的关联研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/斜视治疗方法的研究进展.pdf -> /data/result/基础研究/斜视治疗方法的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/无创性ON_OFF刺激在人类近视研究中的应用.pdf -> /data/result/基础研究/无创性ON_OFF刺激在人类近视研究中的应用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/暂时陛近视和近视进展的研究.pdf -> /data/result/基础研究/暂时陛近视和近视进展的研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/炎症对近视发生发展影响的研究进展.pdf -> /data/result/基础研究/炎症对近视发生发展影响的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/用眼习惯评分和父母近视对初中生近视进展相关参数影响.pdf -> /data/result/基础研究/用眼习惯评分和父母近视对初中生近视进展相关参数影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/病理性近视的基因研究.pdf -> /data/result/基础研究/病理性近视的基因研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/病理性近视的研究进展.pdf -> /data/result/基础研究/病理性近视的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/病理性近视研究进展.pdf -> /data/result/基础研究/病理性近视研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/眼视光视觉教学训练在近视防控中的应用研究.pdf -> /data/result/基础研究/眼视光视觉教学训练在近视防控中的应用研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/眼部信号转导在近视发病机制中的作用.pdf -> /data/result/基础研究/眼部信号转导在近视发病机制中的作用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/离焦性近视发病机制的研究进展.pdf -> /data/result/基础研究/离焦性近视发病机制的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/脉络膜在近视发展中的调控作用及其机制.pdf -> /data/result/基础研究/脉络膜在近视发展中的调控作用及其机制.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/视感知觉学习对年龄相关性黄斑变性患者黄斑光敏度影响研究.pdf -> /data/result/基础研究/视感知觉学习对年龄相关性黄斑变性患者黄斑光敏度影响研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/视知觉学习在青光眼患者视功能康复中的应用.pdf -> /data/result/基础研究/视知觉学习在青光眼患者视功能康复中的应用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/视网膜信号分子在近视发生和进展中的作用.pdf -> /data/result/基础研究/视网膜信号分子在近视发生和进展中的作用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/视黄酸在实验性近视发生中作用的研究进展.pdf -> /data/result/基础研究/视黄酸在实验性近视发生中作用的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜光学区大小与近视控制的关系.pdf -> /data/result/基础研究/角膜塑形镜光学区大小与近视控制的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜在儿童近视防控中的应用效果及安全性探讨.pdf -> /data/result/基础研究/角膜塑形镜在儿童近视防控中的应用效果及安全性探讨.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜在近视控制中的研究进展.pdf -> /data/result/基础研究/角膜塑形镜在近视控制中的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜在近视防控中应用的研究进展.pdf -> /data/result/基础研究/角膜塑形镜在近视防控中应用的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜控制近视进展的研究现状.pdf -> /data/result/基础研究/角膜塑形镜控制近视进展的研究现状.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜离焦技术在青少年近视防控中的应用价值.pdf -> /data/result/基础研究/角膜塑形镜离焦技术在青少年近视防控中的应用价值.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜联合 0.0 1%阿托品滴眼液在青少年早期近视防控的有效性及安全性分析.pdf -> /data/result/基础研究/角膜塑形镜联合 0.0 1%阿托品滴眼液在青少年早期近视防控的有效性及安全性分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/角膜塑形镜配合减少周边远视离焦框架镜对儿童近视发展的回顾性分析.pdf -> /data/result/基础研究/角膜塑形镜配合减少周边远视离焦框架镜对儿童近视发展的回顾性分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/调节及双眼视功能与近视防控.pdf -> /data/result/基础研究/调节及双眼视功能与近视防控.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/调节神经支配与近视关系的研究进展.pdf -> /data/result/基础研究/调节神经支配与近视关系的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/调节训练对儿童及青少年近视防控效果的研究.pdf -> /data/result/基础研究/调节训练对儿童及青少年近视防控效果的研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/调节诱导的眼内压变化在近视发生发展中的作用.pdf -> /data/result/基础研究/调节诱导的眼内压变化在近视发生发展中的作用.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/运动员视觉保健.pdf -> /data/result/基础研究/运动员视觉保健.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视儿童家长近视防控认知-行为-需求的现况调查.pdf -> /data/result/基础研究/近视儿童家长近视防控认知-行为-需求的现况调查.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视儿童角膜塑形后角膜屈光力分布与眼轴增长之间的关系.pdf -> /data/result/基础研究/近视儿童角膜塑形后角膜屈光力分布与眼轴增长之间的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视及近视性弱视研究进展.pdf -> /data/result/基础研究/近视及近视性弱视研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视发生发展中脉络膜变化的研究进展.pdf -> /data/result/基础研究/近视发生发展中脉络膜变化的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视发生发展的机制及环境因素的研究进展.pdf -> /data/result/基础研究/近视发生发展的机制及环境因素的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视发生和发展的遗传学研究进展.pdf -> /data/result/基础研究/近视发生和发展的遗传学研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视屈光参差儿童周边视网膜离焦状态分析.pdf -> /data/result/基础研究/近视屈光参差儿童周边视网膜离焦状态分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视性屈光参差儿童周边离焦与眼生物参数的相关性.pdf -> /data/result/基础研究/近视性屈光参差儿童周边离焦与眼生物参数的相关性.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视性离焦镜片联合调节灵敏度训练控制青少年近视的疗效.pdf -> /data/result/基础研究/近视性离焦镜片联合调节灵敏度训练控制青少年近视的疗效.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视患者周边视网膜病变与眼部生物学参数之间的关系.pdf -> /data/result/基础研究/近视患者周边视网膜病变与眼部生物学参数之间的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视欠矫和足矫对学龄儿童近视进展影响的Meta分析.pdf -> /data/result/基础研究/近视欠矫和足矫对学龄儿童近视进展影响的Meta分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视的表观遗传学机制.pdf -> /data/result/基础研究/近视的表观遗传学机制.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视的遗传学研究.pdf -> /data/result/基础研究/近视的遗传学研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼与青光眼关系的研究进展.pdf -> /data/result/基础研究/近视眼与青光眼关系的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼像差与人眼客观焦深的关联性分析.pdf -> /data/result/基础研究/近视眼像差与人眼客观焦深的关联性分析.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼儿童配戴框架眼镜依从性自评问卷的编制及信度效度检验.pdf -> /data/result/基础研究/近视眼儿童配戴框架眼镜依从性自评问卷的编制及信度效度检验.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼动物模型的研究进展.pdf -> /data/result/基础研究/近视眼动物模型的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼相关信号通路的研究进展.pdf -> /data/result/基础研究/近视眼相关信号通路的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼视网膜的血氧代谢研究.pdf -> /data/result/基础研究/近视眼视网膜的血氧代谢研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼角膜中心厚度与近视度的关系.pdf -> /data/result/基础研究/近视眼角膜中心厚度与近视度的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视眼防控相关框架眼镜研究进展.pdf -> /data/result/基础研究/近视眼防控相关框架眼镜研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近视筛查对防控儿童近视的效果影响.pdf -> /data/result/基础研究/近视筛查对防控儿童近视的效果影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/近距离工作与青少年近视发生和发展的关系.pdf -> /data/result/基础研究/近距离工作与青少年近视发生和发展的关系.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/通过光学离焦原理控制近视的临床研究.pdf -> /data/result/基础研究/通过光学离焦原理控制近视的临床研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/重复低强度红光控制近视的机制及安全性.pdf -> /data/result/基础研究/重复低强度红光控制近视的机制及安全性.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/重视儿童近视前期的管理.pdf -> /data/result/基础研究/重视儿童近视前期的管理.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/间歇性外斜视术后应用视功能训练的临床效果观察.pdf -> /data/result/基础研究/间歇性外斜视术后应用视功能训练的临床效果观察.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/阿托品控制近视增长的研究进展.pdf -> /data/result/基础研究/阿托品控制近视增长的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/阿托品控制近视相关机制研究进展.pdf -> /data/result/基础研究/阿托品控制近视相关机制研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/阿托品控制近视进展及作用机制研究现状.pdf -> /data/result/基础研究/阿托品控制近视进展及作用机制研究现状.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/青少年中高度近视控制发展的临床研究.pdf -> /data/result/基础研究/青少年中高度近视控制发展的临床研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/青少年及儿童近视进展及其危险因素.pdf -> /data/result/基础研究/青少年及儿童近视进展及其危险因素.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/青少年近视影响因素及防控方法进展.pdf -> /data/result/基础研究/青少年近视影响因素及防控方法进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/青少年近视患者佩戴框架镜与夜戴型角膜塑形镜的效果比较.pdf -> /data/result/基础研究/青少年近视患者佩戴框架镜与夜戴型角膜塑形镜的效果比较.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/青少年近视治疗的研究进展.pdf -> /data/result/基础研究/青少年近视治疗的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/青少年近视现状及家庭环境影响因素研究.pdf -> /data/result/基础研究/青少年近视现状及家庭环境影响因素研究.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/青少年近视防控新进展.pdf -> /data/result/基础研究/青少年近视防控新进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/非斜视性视功能异常的近视患者行视功能训练对近视防控疗效评估.pdf -> /data/result/基础研究/非斜视性视功能异常的近视患者行视功能训练对近视防控疗效评估.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/高中生6258人视力及近视调查和近视的预防.pdf -> /data/result/基础研究/高中生6258人视力及近视调查和近视的预防.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/高度近视对眼部结构及视功能影响的研究现状与进展.pdf -> /data/result/基础研究/高度近视对眼部结构及视功能影响的研究现状与进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/高度近视眼球后壁形态分类及其与近视相关并发症关系研究进展.pdf -> /data/result/基础研究/高度近视眼球后壁形态分类及其与近视相关并发症关系研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/高度近视蛋白质组学研究及展望.pdf -> /data/result/基础研究/高度近视蛋白质组学研究及展望.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/高度近视遗传学研究进展.pdf -> /data/result/基础研究/高度近视遗传学研究进展.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/高近附加设计多焦软性角膜接触镜与角膜塑形镜对近视儿童调节的影响.pdf -> /data/result/基础研究/高近附加设计多焦软性角膜接触镜与角膜塑形镜对近视儿童调节的影响.pdf.txt\n",
"[OK] /data/投喂文献/基础研究/高非球微透镜控制儿童青少年近视进展效果分析.pdf -> /data/result/基础研究/高非球微透镜控制儿童青少年近视进展效果分析.pdf.txt\n",
"生成txt数量 138\n",
"[OK] /data/投喂文献/流行病学/2016-2021年上海市静安区中小学生视力不良及其影响因素分析.pdf -> /data/result/流行病学/2016-2021年上海市静安区中小学生视力不良及其影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2017-2020年杭州市健康促进学校建设现况调查.pdf -> /data/result/流行病学/2017-2020年杭州市健康促进学校建设现况调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2018-2020年北京市丰台区儿童青少年近视筛查结果分析.pdf -> /data/result/流行病学/2018-2020年北京市丰台区儿童青少年近视筛查结果分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2019-2021年内蒙古自治区中小学生视力不良状况及影响因素分析.pdf -> /data/result/流行病学/2019-2021年内蒙古自治区中小学生视力不良状况及影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2019-2021年内蒙古自治区儿童青少年近视流行趋势曲线模型分析.pdf -> /data/result/流行病学/2019-2021年内蒙古自治区儿童青少年近视流行趋势曲线模型分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2020 年新津区中小学生常见病状况分析.pdf -> /data/result/流行病学/2020 年新津区中小学生常见病状况分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2020年内蒙古自治区学生近视流行现状及相关影响因素分析.pdf -> /data/result/流行病学/2020年内蒙古自治区学生近视流行现状及相关影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2021 年广西藤县儿童青少年近视筛查结果.pdf -> /data/result/流行病学/2021 年广西藤县儿童青少年近视筛查结果.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2021—2024年永安市儿童青少年近视情况调查及影响因素分析.pdf -> /data/result/流行病学/2021—2024年永安市儿童青少年近视情况调查及影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2021年浙江省台州市中小学生视力低下现状及影响因素分析.pdf -> /data/result/流行病学/2021年浙江省台州市中小学生视力低下现状及影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2021年贵州省学生近视流行现状及戴镜情况分析.pdf -> /data/result/流行病学/2021年贵州省学生近视流行现状及戴镜情况分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2022—2024年广东省肇庆市四会市学生近视流行现状.pdf -> /data/result/流行病学/2022—2024年广东省肇庆市四会市学生近视流行现状.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2023 年从化区学生常见病数据分析.pdf -> /data/result/流行病学/2023 年从化区学生常见病数据分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2023 年南京市鼓楼区学龄前儿童近视筛查结果分析.pdf -> /data/result/流行病学/2023 年南京市鼓楼区学龄前儿童近视筛查结果分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2023年中国6省儿童青少年视力不良与筛查性近视分布情况.pdf -> /data/result/流行病学/2023年中国6省儿童青少年视力不良与筛查性近视分布情况.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2023年宜昌市西陵区儿童青少年常见病监测结果分析.pdf -> /data/result/流行病学/2023年宜昌市西陵区儿童青少年常见病监测结果分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/2024 年沂源县中小学生视力现状与防控策略研究.pdf -> /data/result/流行病学/2024 年沂源县中小学生视力现状与防控策略研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/三门县城关高中学生视力调查.pdf -> /data/result/流行病学/三门县城关高中学生视力调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/上海市杨浦区幼儿园儿童眼轴的光学相干生物测量调查与分析.pdf -> /data/result/流行病学/上海市杨浦区幼儿园儿童眼轴的光学相干生物测量调查与分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/上海市闵行区学校视力监测情况分析.pdf -> /data/result/流行病学/上海市闵行区学校视力监测情况分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/中国6省不同出生年代父母的近视情况对其39岁子女近视的影响分析.pdf -> /data/result/流行病学/中国6省不同出生年代父母的近视情况对其39岁子女近视的影响分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/九江市小学生近视现状及影响因素分析.pdf -> /data/result/流行病学/九江市小学生近视现状及影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/云南省中小学生筛查性近视影响因素分析.pdf -> /data/result/流行病学/云南省中小学生筛查性近视影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/云南省临沧市小学生近视现况调查.pdf -> /data/result/流行病学/云南省临沧市小学生近视现况调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/儿童屈光不正的流行病学调查及矫正方案分析.pdf -> /data/result/流行病学/儿童屈光不正的流行病学调查及矫正方案分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/关于学生近视发病率的调查及当前的学生近视问题.pdf -> /data/result/流行病学/关于学生近视发病率的调查及当前的学生近视问题.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/关于我国大学生近视现状及影响因素认知的研究分析.pdf -> /data/result/流行病学/关于我国大学生近视现状及影响因素认知的研究分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/北京市中学生视力现状与影响因素研究.pdf -> /data/result/流行病学/北京市中学生视力现状与影响因素研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/北京市大兴区 5 所学校学生近视及影响因素分析.pdf -> /data/result/流行病学/北京市大兴区 5 所学校学生近视及影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/北京市海淀区某小学学生近视的危险因素和干预对策研究.pdf -> /data/result/流行病学/北京市海淀区某小学学生近视的危险因素和干预对策研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/北京市门头沟区小学生近视及其相关危险因素分析.pdf -> /data/result/流行病学/北京市门头沟区小学生近视及其相关危险因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/四川省凉山州东五县少年儿童眼病筛查.pdf -> /data/result/流行病学/四川省凉山州东五县少年儿童眼病筛查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/基于 2023 年北京市某小学学生屈光状态筛查.pdf -> /data/result/流行病学/基于 2023 年北京市某小学学生屈光状态筛查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/基于偏联系数的某省中小学生近视趋势分析.pdf -> /data/result/流行病学/基于偏联系数的某省中小学生近视趋势分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/基于南京市 J 区 C 街道青少年近视情况的调查研究.pdf -> /data/result/流行病学/基于南京市 J 区 C 街道青少年近视情况的调查研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/天津市北辰区7~12岁儿童近视患病率及远视储备调查.pdf -> /data/result/流行病学/天津市北辰区7~12岁儿童近视患病率及远视储备调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/奉化市2001-2009年高三学生近视情况调查.pdf -> /data/result/流行病学/奉化市2001-2009年高三学生近视情况调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/学生视力健康状况会影响学业成绩吗——基于全国十省市的调查数据.pdf -> /data/result/流行病学/学生视力健康状况会影响学业成绩吗——基于全国十省市的调查数据.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/宁波市中小学生近视患病率调查.pdf -> /data/result/流行病学/宁波市中小学生近视患病率调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/安康市环境富硒和贫硒地区小学生和中学生近视情况分析.pdf -> /data/result/流行病学/安康市环境富硒和贫硒地区小学生和中学生近视情况分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/射阳县中小学生常见病监测结果分析.pdf -> /data/result/流行病学/射阳县中小学生常见病监测结果分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/小学生筛查性近视与主要带养人报告近视的差异研究.pdf -> /data/result/流行病学/小学生筛查性近视与主要带养人报告近视的差异研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/小学生近视检出率及影响因素研究.pdf -> /data/result/流行病学/小学生近视检出率及影响因素研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/山西省2019年中小学生视力不良现状分析.pdf -> /data/result/流行病学/山西省2019年中小学生视力不良现状分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/山西省四至六年级小学生近视现状及影响因素分析.pdf -> /data/result/流行病学/山西省四至六年级小学生近视现状及影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/广州市越秀区青少年近视特点和危险因素分析.pdf -> /data/result/流行病学/广州市越秀区青少年近视特点和危险因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/广州荔湾区398例儿童FTD和PPA与眼轴长相关性研究.pdf -> /data/result/流行病学/广州荔湾区398例儿童FTD和PPA与眼轴长相关性研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/成年人近视的流行病学及其相关危险因素的研究进展.pdf -> /data/result/流行病学/成年人近视的流行病学及其相关危险因素的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/我国儿童青少年高度近视眼流行病学现状.pdf -> /data/result/流行病学/我国儿童青少年高度近视眼流行病学现状.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/我国初中生视力的影响因素研究_——基于CEPS(2014—2015学年)追访数据的多项Logisti.pdf -> /data/result/流行病学/我国初中生视力的影响因素研究_——基于CEPS(2014—2015学年)追访数据的多项Logisti.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/探析丰台区部分学校青少年近视的情况.pdf -> /data/result/流行病学/探析丰台区部分学校青少年近视的情况.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/新疆K市2023—2024年小学生近视调查.pdf -> /data/result/流行病学/新疆K市2023—2024年小学生近视调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/新疆喀什地区英吉沙县儿童青少年弱视患病率和影响因素分析.pdf -> /data/result/流行病学/新疆喀什地区英吉沙县儿童青少年弱视患病率和影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/新疆英吉沙县儿童和青少年近视前期患病率及其相关因素分析.pdf -> /data/result/流行病学/新疆英吉沙县儿童和青少年近视前期患病率及其相关因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/景宁县畲汉族中学生视力低下状况调查.pdf -> /data/result/流行病学/景宁县畲汉族中学生视力低下状况调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/枣庄地区青少年视力健康状况及其影响.pdf -> /data/result/流行病学/枣庄地区青少年视力健康状况及其影响.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/某高校本科生视力不良情况调查与分析.pdf -> /data/result/流行病学/某高校本科生视力不良情况调查与分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/武城县小学生近视眼患病率流行病学调查及危险因素评价.pdf -> /data/result/流行病学/武城县小学生近视眼患病率流行病学调查及危险因素评价.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/民勤县小学生各年龄段眼轴变化观察研究.pdf -> /data/result/流行病学/民勤县小学生各年龄段眼轴变化观察研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/永安市中小学生近视流行现状、近视影响因素及防控措施探讨.pdf -> /data/result/流行病学/永安市中小学生近视流行现状、近视影响因素及防控措施探讨.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/永泰地区青少年近视率的现状与趋势分析.pdf -> /data/result/流行病学/永泰地区青少年近视率的现状与趋势分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/江西省九江市德安县学生近视筛查的现况调查和影响因素分析.pdf -> /data/result/流行病学/江西省九江市德安县学生近视筛查的现况调查和影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/沂源县中小学生视力健康情况调研报告.pdf -> /data/result/流行病学/沂源县中小学生视力健康情况调研报告.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/沙湾区中小学生生长发育及其常见病状况调查.pdf -> /data/result/流行病学/沙湾区中小学生生长发育及其常见病状况调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/河南省中小学生视力状况与户外活动相关分析.pdf -> /data/result/流行病学/河南省中小学生视力状况与户外活动相关分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/济南市青少年儿童视力筛查结果分析.pdf -> /data/result/流行病学/济南市青少年儿童视力筛查结果分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/浙江省小学生用眼卫生相关知识态度行为现况研究.pdf -> /data/result/流行病学/浙江省小学生用眼卫生相关知识态度行为现况研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/浙江省建德市部分学校调查.pdf -> /data/result/流行病学/浙江省建德市部分学校调查.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/海南省中职学生近视相关因素及防控对策研究.pdf -> /data/result/流行病学/海南省中职学生近视相关因素及防控对策研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/海陵区 2022 年学生视力监测情况.pdf -> /data/result/流行病学/海陵区 2022 年学生视力监测情况.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/深圳市龙华区46岁儿童近视与婴幼儿期视屏接触时间的关系分析.pdf -> /data/result/流行病学/深圳市龙华区46岁儿童近视与婴幼儿期视屏接触时间的关系分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/湘西少数民族地区中小学生近视影响因素分析.pdf -> /data/result/流行病学/湘西少数民族地区中小学生近视影响因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/澳门青少年视力影响因素分析及防控措施建议.pdf -> /data/result/流行病学/澳门青少年视力影响因素分析及防控措施建议.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/福清市2020—2022年城区中小学生视力情况及相关因素分析.pdf -> /data/result/流行病学/福清市2020—2022年城区中小学生视力情况及相关因素分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/空气污染暴露对儿童青少年近视影响的研究进展.pdf -> /data/result/流行病学/空气污染暴露对儿童青少年近视影响的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/综合防控小学生近视的评价体系与预警平台建设.pdf -> /data/result/流行病学/综合防控小学生近视的评价体系与预警平台建设.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/芜湖市中小学生近视情况调查分析.pdf -> /data/result/流行病学/芜湖市中小学生近视情况调查分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/西部农村学校同伴效应对青少年近视的影响.pdf -> /data/result/流行病学/西部农村学校同伴效应对青少年近视的影响.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/贵州省锦屏县2021年学生常见病和健康影响.pdf -> /data/result/流行病学/贵州省锦屏县2021年学生常见病和健康影响.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/运城市盐湖区2018年儿童青少年近视调查结果.pdf -> /data/result/流行病学/运城市盐湖区2018年儿童青少年近视调查结果.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/连南县城区小学生近视防控意识的研究.pdf -> /data/result/流行病学/连南县城区小学生近视防控意识的研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/重庆市小学生家长近视成因认知现况及科普需求的质性分析.pdf -> /data/result/流行病学/重庆市小学生家长近视成因认知现况及科普需求的质性分析.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/青少年健康生活方式与超重_肥胖及近视共患关联的队列研究.pdf -> /data/result/流行病学/青少年健康生活方式与超重_肥胖及近视共患关联的队列研究.pdf.txt\n",
"[OK] /data/投喂文献/流行病学/青少年儿童用眼卫生情况的横断面研究.pdf -> /data/result/流行病学/青少年儿童用眼卫生情况的横断面研究.pdf.txt\n",
"生成txt数量 84\n",
"[OK] /data/投喂文献/普通文献/OCT测量黄斑区神经节细胞复合体厚度在高度近视眼中的应用进展.pdf -> /data/result/普通文献/OCT测量黄斑区神经节细胞复合体厚度在高度近视眼中的应用进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/一种用于近视防控的新型载药离焦软镜.pdf -> /data/result/普通文献/一种用于近视防控的新型载药离焦软镜.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/不同低浓度阿托品制剂对中小学生近视防控的效果.pdf -> /data/result/普通文献/不同低浓度阿托品制剂对中小学生近视防控的效果.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/不同近视治疗手段与脉络膜厚度及血流的研究进展.pdf -> /data/result/普通文献/不同近视治疗手段与脉络膜厚度及血流的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/中医热敷与西医视力训练联合干预.pdf -> /data/result/普通文献/中医热敷与西医视力训练联合干预.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/中小学生课业负担实证研究.pdf -> /data/result/普通文献/中小学生课业负担实证研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/人工智能在近视防控与治疗中的应用进展.pdf -> /data/result/普通文献/人工智能在近视防控与治疗中的应用进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/人工智能在近视防控中的初步应用.pdf -> /data/result/普通文献/人工智能在近视防控中的初步应用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/低强度单波长红光仪与针刺疗法防控青少年儿童近视的疗效对比分析.pdf -> /data/result/普通文献/低强度单波长红光仪与针刺疗法防控青少年儿童近视的疗效对比分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/低浓度阿托品对形觉剥夺性近视豚鼠视网膜多巴胺的影响研究.pdf -> /data/result/普通文献/低浓度阿托品对形觉剥夺性近视豚鼠视网膜多巴胺的影响研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/低浓度阿托品控制近视的临床研究进展.pdf -> /data/result/普通文献/低浓度阿托品控制近视的临床研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/低浓度阿托品滴眼液在青少年近视防控中的疗效分析.pdf -> /data/result/普通文献/低浓度阿托品滴眼液在青少年近视防控中的疗效分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/低浓度阿托品滴眼液在青少年近视防控中的疗效分析2.pdf -> /data/result/普通文献/低浓度阿托品滴眼液在青少年近视防控中的疗效分析2.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/低浓度阿托品滴眼液在青少年近视防控中的疗效分析3.pdf -> /data/result/普通文献/低浓度阿托品滴眼液在青少年近视防控中的疗效分析3.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/体医融合应对青少年近视的协同问卷编制及检验.pdf -> /data/result/普通文献/体医融合应对青少年近视的协同问卷编制及检验.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/体医融合视域下我国农村青少年近视多元主体协同防控研究.pdf -> /data/result/普通文献/体医融合视域下我国农村青少年近视多元主体协同防控研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/体育教学中防控近视的睫状肌训练频次研究.pdf -> /data/result/普通文献/体育教学中防控近视的睫状肌训练频次研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/体育活动防控儿童青少年近视的Meta分析.pdf -> /data/result/普通文献/体育活动防控儿童青少年近视的Meta分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/儿童及青少年近视的药物防控及其进展.pdf -> /data/result/普通文献/儿童及青少年近视的药物防控及其进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/儿童近视防控中家庭视觉健康护理干预的路径设计.pdf -> /data/result/普通文献/儿童近视防控中家庭视觉健康护理干预的路径设计.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/儿童近视防控的家庭护理探究.pdf -> /data/result/普通文献/儿童近视防控的家庭护理探究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/光学生物测量在眼部的测量方法及应用.pdf -> /data/result/普通文献/光学生物测量在眼部的测量方法及应用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/分析眼轴测量监控角膜塑形镜在青少年近视防控中的作用.pdf -> /data/result/普通文献/分析眼轴测量监控角膜塑形镜在青少年近视防控中的作用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/协同治理视角下体医融合应对青少年近视的协同评价模型研究.pdf -> /data/result/普通文献/协同治理视角下体医融合应对青少年近视的协同评价模型研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/同伴教育对近视中小学生视力相关生存质量干预效果分析.pdf -> /data/result/普通文献/同伴教育对近视中小学生视力相关生存质量干预效果分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/同心双焦软性角膜接触镜的短期临床有效性及满意度分析.pdf -> /data/result/普通文献/同心双焦软性角膜接触镜的短期临床有效性及满意度分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/基于Android的青少年视力防控App设计与开发.pdf -> /data/result/普通文献/基于Android的青少年视力防控App设计与开发.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/基于OKAI人工智能系统分析耳穴埋豆联合全息刮痧防控近视的临床研究.pdf -> /data/result/普通文献/基于OKAI人工智能系统分析耳穴埋豆联合全息刮痧防控近视的临床研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/基于“治未病”思想探析中医外治法在儿童青少年近视防控中的应用.pdf -> /data/result/普通文献/基于“治未病”思想探析中医外治法在儿童青少年近视防控中的应用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/基于低相干干涉测量的眼轴长度测量系统研究_中国激光.pdf -> /data/result/普通文献/基于低相干干涉测量的眼轴长度测量系统研究_中国激光.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/基于幼儿视角下的多元化护眼策略.pdf -> /data/result/普通文献/基于幼儿视角下的多元化护眼策略.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/基于数据挖掘探讨针灸防控儿童青少年近视选穴规律.pdf -> /data/result/普通文献/基于数据挖掘探讨针灸防控儿童青少年近视选穴规律.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/基于网络云计算的视知觉训练在弱视治疗中的应用.pdf -> /data/result/普通文献/基于网络云计算的视知觉训练在弱视治疗中的应用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/多光谱屈光地形图MRT联合思问离焦镜片延缓近视增长的有效性分析.pdf -> /data/result/普通文献/多光谱屈光地形图MRT联合思问离焦镜片延缓近视增长的有效性分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/多环离焦软镜与角膜塑形镜在青少年近视防控中的应用价值比较.pdf -> /data/result/普通文献/多环离焦软镜与角膜塑形镜在青少年近视防控中的应用价值比较.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/多环离焦软镜在青少年近视防控中的应用分析.pdf -> /data/result/普通文献/多环离焦软镜在青少年近视防控中的应用分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/学前教育阶段遏制幼儿视力下降的对策研究.pdf -> /data/result/普通文献/学前教育阶段遏制幼儿视力下降的对策研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/学校-家庭综合预防模式对中小学生近视的干预效果分析.pdf -> /data/result/普通文献/学校-家庭综合预防模式对中小学生近视的干预效果分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/学校卫生管理对中小学生近视防控的效果分析.pdf -> /data/result/普通文献/学校卫生管理对中小学生近视防控的效果分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/学龄前儿童的早教对于近视防控的启示.pdf -> /data/result/普通文献/学龄前儿童的早教对于近视防控的启示.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/定期 IOLmaster700 眼轴测量在近视防控中的重要性.pdf -> /data/result/普通文献/定期 IOLmaster700 眼轴测量在近视防控中的重要性.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/家庭结构与健康风险_基于青少年近视的实证分析.pdf -> /data/result/普通文献/家庭结构与健康风险_基于青少年近视的实证分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/家庭防控儿童青少年近视存在问题及改进策略.pdf -> /data/result/普通文献/家庭防控儿童青少年近视存在问题及改进策略.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/巩膜交联法防治近视的基础研究进展.pdf -> /data/result/普通文献/巩膜交联法防治近视的基础研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/巩膜胶原交联术在近视中的研究进展.pdf -> /data/result/普通文献/巩膜胶原交联术在近视中的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/广州双生子眼病研究_对近视防控的启示.pdf -> /data/result/普通文献/广州双生子眼病研究_对近视防控的启示.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/开环状态下调节的暗焦点研究进展.pdf -> /data/result/普通文献/开环状态下调节的暗焦点研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/户外光照和运动在防治近视中的研究进展.pdf -> /data/result/普通文献/户外光照和运动在防治近视中的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/户外活动与青少年近视防控的关系研究.pdf -> /data/result/普通文献/户外活动与青少年近视防控的关系研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/户外活动对近视防控作用及其机制的研究进展.pdf -> /data/result/普通文献/户外活动对近视防控作用及其机制的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/揿针干预儿童青少年假性近视的研究进展.pdf -> /data/result/普通文献/揿针干预儿童青少年假性近视的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/改善儿童眼健康的睫状肌运动表象训练方案设计.pdf -> /data/result/普通文献/改善儿童眼健康的睫状肌运动表象训练方案设计.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/新型冠状病毒的眼睛侵入途径与眼科防护重点.pdf -> /data/result/普通文献/新型冠状病毒的眼睛侵入途径与眼科防护重点.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/日戴或夜戴硬性角膜接触镜对青少年近视防控效果的影响研究.pdf -> /data/result/普通文献/日戴或夜戴硬性角膜接触镜对青少年近视防控效果的影响研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/早产儿近视性屈光不正的研究进展.pdf -> /data/result/普通文献/早产儿近视性屈光不正的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/昼夜节律与儿童近视发生机制的研究进展.pdf -> /data/result/普通文献/昼夜节律与儿童近视发生机制的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/智能传感器下的防近视眼镜设计.pdf -> /data/result/普通文献/智能传感器下的防近视眼镜设计.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/浅谈中学生近视防控中家庭学校社会联合干预的可能性.pdf -> /data/result/普通文献/浅谈中学生近视防控中家庭学校社会联合干预的可能性.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/消旋山莨菪碱(0.05%)防治近视的临床观察.pdf -> /data/result/普通文献/消旋山莨菪碱(0.05%)防治近视的临床观察.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/短波光防护眼镜控制儿童近视发展的效果评估.pdf -> /data/result/普通文献/短波光防护眼镜控制儿童近视发展的效果评估.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/研究视觉训练对青少年近视防控的临床效果与满意度情况.pdf -> /data/result/普通文献/研究视觉训练对青少年近视防控的临床效果与满意度情况.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/精细化眼科护理模式在小学生近视防控中的应用价值分析.pdf -> /data/result/普通文献/精细化眼科护理模式在小学生近视防控中的应用价值分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/紫光影响脉络膜厚度及血流动态平衡抑制近视的机制研究及近视防控新策略.pdf -> /data/result/普通文献/紫光影响脉络膜厚度及血流动态平衡抑制近视的机制研究及近视防控新策略.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/红光疗法在眼科疾病中的应用研究进展.pdf -> /data/result/普通文献/红光疗法在眼科疾病中的应用研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/耳穴压丸在近视防控中的进展.pdf -> /data/result/普通文献/耳穴压丸在近视防控中的进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/耳穴疗法联合揿针与0.01%阿托品治疗儿童青少年轻度近视随机对照研究.pdf -> /data/result/普通文献/耳穴疗法联合揿针与0.01%阿托品治疗儿童青少年轻度近视随机对照研究.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/脉络膜在角膜塑形镜控制近视中的作用.pdf -> /data/result/普通文献/脉络膜在角膜塑形镜控制近视中的作用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/视觉综合护理干预对儿童近视的预防效果探讨.pdf -> /data/result/普通文献/视觉综合护理干预对儿童近视的预防效果探讨.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/运动视觉训练在小学体育教学中的应用.pdf -> /data/result/普通文献/运动视觉训练在小学体育教学中的应用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/近视干预中阿托品滴眼液的临床应用与机制研究进展.pdf -> /data/result/普通文献/近视干预中阿托品滴眼液的临床应用与机制研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/近视眼防控的研究进展.pdf -> /data/result/普通文献/近视眼防控的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/近视防控措施在青少年群体中的应用效果.pdf -> /data/result/普通文献/近视防控措施在青少年群体中的应用效果.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/近视防控措施的有效性评估与分析.pdf -> /data/result/普通文献/近视防控措施的有效性评估与分析.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/近视防控:从生活习惯到科学矫正的全攻略.pdf -> /data/result/普通文献/近视防控:从生活习惯到科学矫正的全攻略.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/近视防治方法的研究进展.pdf -> /data/result/普通文献/近视防治方法的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/近视预防干预措施对中学生近视的防控效果.pdf -> /data/result/普通文献/近视预防干预措施对中学生近视的防控效果.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/通过眼轴测量监控角膜塑形镜在青少年近视防控中的作用.pdf -> /data/result/普通文献/通过眼轴测量监控角膜塑形镜在青少年近视防控中的作用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/阿托品在近视防控中不同作用机制的研究进展.pdf -> /data/result/普通文献/阿托品在近视防控中不同作用机制的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/阿托品在青少年近视防控中的应用.pdf -> /data/result/普通文献/阿托品在青少年近视防控中的应用.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/阿托品滴眼液用于儿童近视防控的伦理探究 (1).pdf -> /data/result/普通文献/阿托品滴眼液用于儿童近视防控的伦理探究 (1).pdf.txt\n",
"[OK] /data/投喂文献/普通文献/阿托品近视防控的临床与实验室研究进展.pdf -> /data/result/普通文献/阿托品近视防控的临床与实验室研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/青少年近视的预防及研究进展.pdf -> /data/result/普通文献/青少年近视的预防及研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/青少年近视相关影响因素的研究进展.pdf -> /data/result/普通文献/青少年近视相关影响因素的研究进展.pdf.txt\n",
"[OK] /data/投喂文献/普通文献/青少年近视防控医疗质量控制的研究现状.pdf -> /data/result/普通文献/青少年近视防控医疗质量控制的研究现状.pdf.txt\n",
"生成txt数量 84\n",
"[OK] /data/投喂文献/政策报告/三部门加强儿童青少年近视防控_不得占用假期补课.pdf -> /data/result/政策报告/三部门加强儿童青少年近视防控_不得占用假期补课.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/中国儿童近视状况研究.pdf -> /data/result/政策报告/中国儿童近视状况研究.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/健康中国战略背景下儿童青少年近视现状调查及防控政策研究.pdf -> /data/result/政策报告/健康中国战略背景下儿童青少年近视现状调查及防控政策研究.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/健康中国背景下我国青少年近视防控的政策工具研究.pdf -> /data/result/政策报告/健康中国背景下我国青少年近视防控的政策工具研究.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/儿童青少年近视率居世界第一近视防控势在必行.pdf -> /data/result/政策报告/儿童青少年近视率居世界第一近视防控势在必行.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/公共卫生视角下青少年近视防控健康模式构建与实践.pdf -> /data/result/政策报告/公共卫生视角下青少年近视防控健康模式构建与实践.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/国家卫生健康委办公厅关于印发防控儿童青少年近视核心知识十条的通知.pdf -> /data/result/政策报告/国家卫生健康委办公厅关于印发防控儿童青少年近视核心知识十条的通知.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/国家卫生健康委办公厅发布《关于防控儿童青少年近视核心知识十条的通知》.pdf -> /data/result/政策报告/国家卫生健康委办公厅发布《关于防控儿童青少年近视核心知识十条的通知》.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/多措并举种好近视防控基地标准化_试验田_.pdf -> /data/result/政策报告/多措并举种好近视防控基地标准化_试验田_.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/我国学校近视防控的短板与对策.pdf -> /data/result/政策报告/我国学校近视防控的短板与对策.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/我国青少年近视防控实施建议.pdf -> /data/result/政策报告/我国青少年近视防控实施建议.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/新形势下医疗机构怎样做好近视防控.pdf -> /data/result/政策报告/新形势下医疗机构怎样做好近视防控.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/构建陕西省青少年体育健康视力规范管理对近视防控体系建设的重要作用.pdf -> /data/result/政策报告/构建陕西省青少年体育健康视力规范管理对近视防控体系建设的重要作用.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/浙江省青少年近视防控体医融合治理研究.pdf -> /data/result/政策报告/浙江省青少年近视防控体医融合治理研究.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/社会生态学视角下青少年儿童近视防控对策研究.pdf -> /data/result/政策报告/社会生态学视角下青少年儿童近视防控对策研究.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/近视疾病经济负担研究进展.pdf -> /data/result/政策报告/近视疾病经济负担研究进展.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/近视诊治的社会经济负担评估进展.pdf -> /data/result/政策报告/近视诊治的社会经济负担评估进展.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/近视防控要基地引领也要全社会参与.pdf -> /data/result/政策报告/近视防控要基地引领也要全社会参与.pdf.txt\n",
"[OK] /data/投喂文献/政策报告/青少年近视综合防控的基本原则与对策.pdf -> /data/result/政策报告/青少年近视综合防控的基本原则与对策.pdf.txt\n",
"生成txt数量 19\n"
]
}
],
"source": [
"source_path = \"/data/投喂文献/\"\n",
"\n",
"#批量OCR\n",
"files = os.listdir(source_path)\n",
"for f in files:\n",
" input_dir = os.path.join(source_path,f)\n",
" \n",
" output_dir = os.path.join(\"/data/result\",f)\n",
" # save_images = \"/data/output/image\" # 或者 True / False\n",
" save_images = False\n",
" \n",
" outs = extract_folder_to_txts(input_dir, output_dir=output_dir, save_images=save_images)\n",
" print(\"生成txt数量\", len(outs))"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "316db255-654f-4aad-880e-8724038b6063",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['核心文献', '科普', '基础研究', '流行病学', '普通文献', '政策报告']"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "689d2f3f-3c3e-4a16-9e77-0ee9112318ee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[OK] /data/docs/1_远像光屏临床研究论文合集20220208.pdf -> /data/output/1_远像光屏临床研究论文合集20220208.pdf.txt\n",
"[OK] /data/docs/1_阿托品防控近视眼的安全性研究进展.pdf -> /data/output/1_阿托品防控近视眼的安全性研究进展.pdf.txt\n",
"[OK] /data/docs/2022年中国儿童青少年视觉健康白皮书-艾瑞咨询-202206.pdf -> /data/output/2022年中国儿童青少年视觉健康白皮书-艾瑞咨询-202206.pdf.txt\n",
"[OK] /data/docs/Q&A汇集.pdf -> /data/output/Q&A汇集.pdf.txt\n",
"[OK] /data/docs/c0e7523d3b5e27c903eb8748a475520c.docx -> /data/output/c0e7523d3b5e27c903eb8748a475520c.docx.txt\n",
"[OK] /data/docs/iAVT相关背景-睿视.pdf -> /data/output/iAVT相关背景-睿视.pdf.txt\n",
"[OK] /data/docs/iavt技术深度解释材料.pdf -> /data/output/iavt技术深度解释材料.pdf.txt\n",
"[OK] /data/docs/《2023眼视光行业白皮书》.pdf -> /data/output/《2023眼视光行业白皮书》.pdf.txt\n",
"[OK] /data/docs/中国儿童弱视防治专家共识2021年.pdf -> /data/output/中国儿童弱视防治专家共识2021年.pdf.txt\n",
"[OK] /data/docs/儿童屈光矫正专家共识(2017).pdf -> /data/output/儿童屈光矫正专家共识(2017).pdf.txt\n",
"[OK] /data/docs/关于视训的相关问题解答.docx -> /data/output/关于视训的相关问题解答.docx.txt\n",
"[OK] /data/docs/基于自适应对比度检测训练的知觉学习在间歇性外斜视中的应用:一项前瞻性双盲随机对照试验.PDF -> /data/output/基于自适应对比度检测训练的知觉学习在间歇性外斜视中的应用:一项前瞻性双盲随机对照试验.pdf.txt\n",
"[OK] /data/docs/备忘录文档_202602121309.pdf -> /data/output/备忘录文档_202602121309.pdf.txt\n",
"[OK] /data/docs/对比敏感度专家共识 20230510.docx -> /data/output/对比敏感度专家共识 20230510.docx.txt\n",
"[OK] /data/docs/弱视诊治指南.pdf -> /data/output/弱视诊治指南.pdf.txt\n",
"[OK] /data/docs/最新文献摘要.pdf -> /data/output/最新文献摘要.pdf.txt\n",
"[OK] /data/docs/温医残留性弱视临床研究SCI论文.pdf -> /data/output/温医残留性弱视临床研究SCI论文.pdf.txt\n",
"[OK] /data/docs/省立三院筛查后三色近视防控体系解读.pptx -> /data/output/省立三院筛查后三色近视防控体系解读.pptx.txt\n",
"[OK] /data/docs/脑机时代视域下的范式转移:视焱脑科学重塑中国眼视光行业.pptx -> /data/output/脑机时代视域下的范式转移:视焱脑科学重塑中国眼视光行业.pptx.txt\n",
"[OK] /data/docs/重复低强度红光照射辅助治疗儿童青少年近视专家共识2022-中华实验眼科杂志2022年7月第40卷第7期(2)(2)(1).pdf -> /data/output/重复低强度红光照射辅助治疗儿童青少年近视专家共识2022-中华实验眼科杂志2022年7月第40卷第7期(2)(2)(1).pdf.txt\n",
"[OK] /data/docs/黄昌兵弱视论文汇编.pdf -> /data/output/黄昌兵弱视论文汇编.pdf.txt\n",
"[OK] /data/docs/黄昌兵文献.pdf -> /data/output/黄昌兵文献.pdf.txt\n",
"[OK] /data/docs/黄昌兵,周佳玮和侯方的文献整理.docx -> /data/output/黄昌兵,周佳玮和侯方的文献整理.docx.txt\n",
"生成txt数量 23\n"
]
}
],
"source": [
"#批量OCR\n",
"input_dir = \"/data/docs/\"\n",
"output_dir = \"/data/output/\"\n",
"save_images = \"/data/output/image\" # 或者 True / False\n",
"\n",
"outs = extract_folder_to_txts(input_dir, output_dir=output_dir, save_images=save_images)\n",
"print(\"生成txt数量\", len(outs))\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "ddd44d5a-3a33-4186-a756-94c9e5ab6d0d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Visual Acuity Tests Using Chart Line and Single Optotype in Healthy and Amblyopic Children Yair Morad MD. Eitan Werker. MD and Pinhas NemetMD ', 'Height': 354, 'SubImageCount': 1, 'S\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Perceptual Learning Improves Visual Performance in Juvenile Amblyopia Roger W.Li 21 Karen G. Young Pia Hoenig and Dennis M. Levi ', 'Height': 174, 'SubImageCount': 1, 'SubImages': [{'\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Journal of Vision (2009)9(11)241-14 http/joumalofvision.org/9/11/24/ 1 Mechanisms underlyingI perceptual learning of contrast detection in adults with anisometropic amblyopia Laborator\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Journal of Vision (2009)9(3)171-16 http//journalofvisionorg/9/3/17/ 1 Binocular combination in anisometropic amblyopia Laboratory of Brain Processes ( (LOBES). Department of Psychology\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Broad bandwidth of perceptual learning in the visual system of adults with anisometropic amblyopia SVNd Chang-Bing Huang*+ Yifeng Zhou* and Zhong-Lin Lu++ *Vision Research Laboratory S\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': '屈光参差性弱视的知觉机制及知觉学习 吕忠林 黄 黄昌兵周逸峰 【摘要】弱视是一种由视觉系统发育障碍所引起且不能通过光学途径加以矫正的视力缺陷。尽管通常认为弱视反映了发 育过程中异常视觉经验引起的视觉皮层功能异常,但其神经机制至今未被完全阐明。本文结合我们最新的研究综述了弱视损害 的知觉机制,并探讨通过知觉训练改善弱视患者视功能的可能性。成人弱视的视觉系统仍存在一定的可塑性\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Available online at www.sciencedirect.com ScienceDirect Vision Research ELSEVIER Vision Research 47 (2007) 22-34 www.elsevier.com/locate/visres Treated amblyopes remain deficient in spati\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Available online at www.sciencedirect.com BCENC- DIRECTO Vision Research n ELSEVIER Vision Research 46 (2006) 739-750 www.elsevier.com/locate/visres Perceptual learning improves contrast \n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'Visual Psychophysics and Physiological Optics Training in Contrast Detection Improves Motion Perception of Sinewave Gratings in Amblyopia Fang Hou Cbang-bing Huang²Liming TaoLixia Feng\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': 'VISION RESEARCH Vision Research Volume 43 Issue 6 March 2003 Pages 729-738 ELSEVIER Deficits to global motion np processing in human amblyopia Anita J Simmers a区 Tim Ledgeway b Robert\n",
"DEBUG type(data): <class 'alibabacloud_ocr_api20210707.models.RecognizeAllTextResponseBodyData'>\n",
"DEBUG data keys: {'Content': '弱视的立体视知觉学习 席洁,贾武力,封利霞,吕忠林,黄昌兵 目的:弱视是导致单眼和双眼视力降低的一种常见眼科疾病。弱视的传统治疗方式有光学矫 正和\"遮盖\"疗法,这些方法能有效的恢复弱视患者的单眼视锐度,但是对双眼视觉功能特别 是立体视的恢复并不理想。本研究考察了知觉学习对弱视患者立体视功能恢复的可能作用。 方法研究被试为11名屈光参差性或屈光不正性弱视被试(21.1±5\n"
]
}
],
"source": [
"#单文件OCR\n",
"file_path = \"/data/docs/c0e7523d3b5e27c903eb8748a475520c.docx\" \n",
"output_dir = \"/data/output/\"\n",
"save_images = \"/data/output/image\"\n",
"out_txt = extract_single_file_to_txt(file_path, output_dir=output_dir, save_images=save_images)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "OCR",
"language": "python",
"name": "ocr"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}