file-type

数据仓库上游系统调研与贴源层分析

下载需积分: 50 | 2.4MB | 更新于2024-09-05 | 3 浏览量 | 3 评论 | 13 下载量 举报 收藏
download 立即下载
"该文档是2019年8月28日关于数据仓库贴源层的演示,主要内容涉及数据仓库入仓表的分析规则,上游系统现状的调研,以及如何选择适合入仓的业务表。文档提到了缓冲存储层的作用,数据质量检验的策略,以及入仓系统的初步筛选标准。" 数据仓库是存储和管理大量结构化数据以支持业务智能、数据分析和决策制定的系统。在数据仓库的设计中,贴源层或近源层扮演着重要角色,它位于数据源和数据仓库的其他层次之间,旨在提供一个可供查询的、快速访问的数据副本,同时保护原始数据源不受频繁读取和误操作的影响。 贴源层的设计目的是减少对上游系统的性能负载,允许数据重跑,并且通常保持与源系统相似的表结构,添加额外的信息如源系统标识、ETL日期和供数方式。在数据质量检验方面,包括成功标识、系统名校验、表校验、字段对比、数据量、分区和历史缓慢变化维的检查,确保数据的准确性和一致性。 文档中提到的首批九个入仓系统名单,以及后续的约41个入仓系统的选择过程,展示了数据仓库构建过程中严谨的系统评估。在选择入仓系统时,需要考虑系统的字典版本、数据库连接信息、系统是否仍在运行、数据字段的用途和必要性等因素。例如,字段级的分析排除了无意义的字段(如空字段、固定值字段、加载时间等)、业务上不使用的长文本、流程控制字段、中间计算结果、未启用字段、冗余字段和非结构化数据。 对于上游系统表的业务含义和下游用途的理解至关重要。数据仓库系统定位为提供服务给各类内部应用、OLAP分析、BI系统,重点关注那些关键业务数据、保留粒度较细的表,如信息表、业务明细、交易流水、映射关系表、维度表和参数代码类型表。同时,需要排除内部控制、业务流程控制表,以及中间的、临时的、备份的、冗余的、预留的数据,以及无活动的表。 规划的下游系统模型主题涵盖了金融行业的多个领域,包括监管报告、风险管理、对账、营销、客户关系管理、绩效考核等多个业务场景,表明数据仓库在银行业务中的核心地位,为各类业务决策提供数据支持。 这份演示文稿详细阐述了数据仓库贴源层的设计原则和上游系统调研的关键点,提供了选择合适入仓数据的框架,为构建高效、稳定的数据仓库体系提供了指导。

相关推荐

filetype
filetype

from pptx import Presentation


def _matching_layout(source_ppt, target_ppt, slide):
    """Return the target layout at the same index as the slide's source layout.

    Falls back to layout 6 of the target (the blank layout in python-pptx's
    default template, which is what ``Presentation()`` creates) when the
    source layout cannot be located or the target has no layout at that index.
    """
    target_layouts = target_ppt.slide_layouts
    try:
        return target_layouts[source_ppt.slide_layouts.index(slide.slide_layout)]
    except (ValueError, IndexError):
        return target_layouts[6] if len(target_layouts) > 6 else target_layouts[0]


def _copy_font(src_font, dst_font):
    """Copy name, size, bold, italic and (when explicitly set) the RGB color."""
    dst_font.name = src_font.name
    dst_font.size = src_font.size
    dst_font.bold = src_font.bold
    dst_font.italic = src_font.italic
    try:
        # The right-hand side is evaluated first, so the destination font is
        # left untouched when the source has no explicit RGB color.
        dst_font.color.rgb = src_font.color.rgb
    except AttributeError:
        # Inherited or theme colors expose no ``.rgb``; keep the default.
        pass


def _copy_text_frame(src_frame, dst_frame):
    """Replicate paragraphs and runs of ``src_frame`` into ``dst_frame``."""
    dst_frame.clear()  # leaves exactly one empty paragraph behind
    for position, src_paragraph in enumerate(src_frame.paragraphs):
        # Reuse the paragraph left by clear() so no blank leading line appears.
        if position == 0:
            dst_paragraph = dst_frame.paragraphs[0]
        else:
            dst_paragraph = dst_frame.add_paragraph()
        for src_run in src_paragraph.runs:
            dst_run = dst_paragraph.add_run()
            dst_run.text = src_run.text
            _copy_font(src_run.font, dst_run.font)


def _copy_table(src_table, dst_table):
    """Copy the plain text of every cell from ``src_table`` to ``dst_table``."""
    for row_idx in range(len(src_table.rows)):
        for col_idx in range(len(src_table.columns)):
            dst_table.cell(row_idx, col_idx).text = src_table.cell(row_idx, col_idx).text


def _rebuild_chart_data(chart):
    """Build a ``CategoryChartData`` from the first plot of ``chart``.

    python-pptx charts do not expose the chart data they were created from,
    so it is reconstructed from the plot's categories and series values.
    NOTE(review): only category-based charts are covered; XY and bubble
    charts need ``XyChartData`` / ``BubbleChartData`` instead - confirm
    whether the decks being merged contain any.
    """
    from pptx.chart.data import CategoryChartData

    chart_data = CategoryChartData()
    plot = chart.plots[0]
    chart_data.categories = list(plot.categories)
    for series in plot.series:
        chart_data.add_series(series.name, list(series.values))
    return chart_data


def merge_ppts(source_ppts, target_ppt_path):
    """Merge several .pptx files into one new presentation.

    For every slide of every source file a new slide is appended to a fresh
    presentation, and text frames, tables and charts are re-created on it.
    Other shape kinds (pictures, groups, connectors, ...) are not copied.

    Args:
        source_ppts: Iterable of paths of the presentations to merge, in order.
        target_ppt_path: Path the merged presentation is saved to.
    """
    target_ppt = Presentation()

    for source_ppt_path in source_ppts:
        source_ppt = Presentation(source_ppt_path)
        for slide in source_ppt.slides:
            new_slide = target_ppt.slides.add_slide(
                _matching_layout(source_ppt, target_ppt, slide)
            )
            for shape in slide.shapes:
                geometry = (shape.left, shape.top, shape.width, shape.height)
                if shape.has_text_frame:
                    new_shape = new_slide.shapes.add_textbox(*geometry)
                    _copy_text_frame(shape.text_frame, new_shape.text_frame)
                elif getattr(shape, "has_table", False):
                    table = shape.table
                    new_table = new_slide.shapes.add_table(
                        len(table.rows), len(table.columns), *geometry
                    ).table
                    _copy_table(table, new_table)
                elif getattr(shape, "has_chart", False):
                    chart = shape.chart
                    new_slide.shapes.add_chart(
                        chart.chart_type, *geometry, _rebuild_chart_data(chart)
                    )

    target_ppt.save(target_ppt_path)


# Usage example
if __name__ == "__main__":
    source_ppts = ['ppt1.pptx', 'ppt2.pptx', 'ppt3.pptx']
    target_ppt_path = 'merged.pptx'
    merge_ppts(source_ppts, target_ppt_path)

# Follow-up request attached to this snippet (translated): when deciding
# whether a shape is text, title shapes should be excluded.

filetype

import os
import threading
import time

import psutil
import pyautogui
import win32com.client
import win32con
import win32gui
from pythoncom import CoInitialize, CoUninitialize  # explicit import of the COM functions
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.firefox.service import Service as FirefoxService

# No logging configuration of its own: the host application (app.py) injects
# a logger through the set_logger() class methods.


class PPTController:
    """Open a presentation as a looping full-screen slide show and page through it.

    All state is kept on the class, so at most one presentation is controlled
    at a time.
    """

    _is_active = False        # True while a presentation is open
    _is_ready = False         # True once the slide show is running full screen
    _presentation = None      # COM object of the currently open presentation
    _application = None       # COM object of PowerPoint / WPS
    _lock = threading.Lock()  # serialises open / navigate / close
    _logger = None            # logger injected by the host application
    _com_initialized = False  # True after a successful CoInitialize()

    @classmethod
    def set_logger(cls, logger):
        """Set the logger used by this class."""
        cls._logger = logger

    @classmethod
    def _log(cls, level, message):
        """Emit ``message`` at ``level`` if a logger has been configured."""
        if cls._logger:
            getattr(cls._logger, level)(message)

    @classmethod
    def open_fullscreen(cls, ppt_path):
        """Open ``ppt_path`` and start a looping full-screen slide show.

        Any presentation that is already open is closed first. PowerPoint is
        tried first; WPS is used as a fallback.

        Args:
            ppt_path: Path of the presentation file.

        Returns:
            A ``(success, message)`` tuple.
        """
        if cls._is_active:
            cls.close()

        try:
            with cls._lock:
                try:
                    CoInitialize()
                    cls._com_initialized = True
                    cls._log("debug", "COM初始化成功")
                except Exception as e:
                    cls._log("error", f"COM初始化失败: {str(e)}")
                    return False, f"COM初始化失败: {str(e)}"

                if not os.path.exists(ppt_path):
                    cls._log("error", f"PPT文件不存在: {ppt_path}")
                    cls._safe_uninitialize()
                    return False, "PPT文件不存在"

                try:
                    cls._application = win32com.client.Dispatch("PowerPoint.Application")
                    cls._log("info", "使用Microsoft PowerPoint打开文件")
                    is_powerpoint = True
                except Exception as e:
                    cls._log("warning", f"无法启动PowerPoint, 尝试WPS: {str(e)}")
                    try:
                        cls._application = win32com.client.Dispatch("KWPP.Application")
                        cls._log("info", "使用WPS演示打开文件")
                        is_powerpoint = False
                    except Exception as e2:
                        cls._log("error", f"无法启动WPS: {str(e2)}")
                        cls._safe_uninitialize()
                        return False, f"无法启动PowerPoint或WPS: {str(e)}"

                cls._application.Visible = True

                try:
                    # The Office process resolves relative paths against its
                    # own working directory, so always hand it an absolute one.
                    cls._presentation = cls._application.Presentations.Open(
                        os.path.abspath(ppt_path), WithWindow=True
                    )
                except Exception as e:
                    cls._log("error", f"打开PPT文件失败: {str(e)}")
                    cls._cleanup_resources()
                    return False, f"打开PPT文件失败: {str(e)}"

                try:
                    slide_show_settings = cls._presentation.SlideShowSettings
                    # The loop property is set under a different name for
                    # PowerPoint and for WPS.
                    if is_powerpoint:
                        slide_show_settings.LoopUntilStopped = True
                    else:
                        slide_show_settings.Loop = True
                    slide_show_settings.Run()
                except Exception as e:
                    cls._log("error", f"启动幻灯片放映失败: {str(e)}")
                    cls._cleanup_resources()
                    return False, f"启动幻灯片放映失败: {str(e)}"

                time.sleep(1)  # give the slide-show window time to appear
                cls.maximize_window()

                cls._is_active = True
                cls._is_ready = True
                cls._log("info", f"PPT已全屏循环播放: {ppt_path}")
                return True, "PPT已全屏循环播放"

        except Exception as e:
            cls._cleanup_resources()
            cls._log("error", f"打开PPT失败: {str(e)}")
            return False, f"打开PPT失败: {str(e)}"

    @classmethod
    def _cleanup_resources(cls):
        """Close the presentation, quit the application and release COM.

        Failures are logged rather than raised, and the stored COM references
        are always dropped so a later call never reuses a stale object.
        """
        if cls._presentation:
            try:
                cls._presentation.Close()
            except Exception as e:
                cls._log("warning", f"关闭演示文稿失败: {str(e)}")
            finally:
                cls._presentation = None

        if cls._application:
            try:
                cls._application.Quit()
            except Exception as e:
                cls._log("warning", f"退出演示程序失败: {str(e)}")
            finally:
                cls._application = None

        cls._safe_uninitialize()

        cls._is_active = False
        cls._is_ready = False

    @classmethod
    def _safe_uninitialize(cls):
        """Call CoUninitialize() only if CoInitialize() succeeded earlier."""
        if not cls._com_initialized:
            cls._log("debug", "COM未初始化,无需反初始化")
            return
        try:
            CoUninitialize()
            cls._com_initialized = False
            cls._log("debug", "已反初始化COM")
        except Exception as e:
            cls._log("warning", f"反初始化COM失败: {str(e)}")

    @classmethod
    def maximize_window(cls):
        """Maximise every slide-show window and make it topmost.

        Returns:
            False when no presentation is open or enumeration failed,
            True otherwise (also when no matching window was found).
        """
        if not cls._presentation:
            return False

        def enum_windows_callback(hwnd, _):
            window_title = win32gui.GetWindowText(hwnd)
            if "PowerPoint Slide Show" in window_title or "WPS 演示" in window_title:
                win32gui.ShowWindow(hwnd, win32con.SW_MAXIMIZE)
                # HWND_TOPMOST pins the window; SWP_NOMOVE | SWP_NOSIZE makes
                # the four zero coordinates ignored.
                win32gui.SetWindowPos(
                    hwnd,
                    win32con.HWND_TOPMOST,
                    0, 0, 0, 0,
                    win32con.SWP_NOMOVE | win32con.SWP_NOSIZE,
                )
                cls._log("info", f"最大化并置顶窗口: {window_title}")

        try:
            win32gui.EnumWindows(enum_windows_callback, None)
            return True
        except Exception as e:
            cls._log("warning", f"最大化并置顶窗口失败: {str(e)}")
            return False

    @classmethod
    def navigate(cls, direction):
        """Turn the page by simulating the up / down arrow keys.

        Args:
            direction: ``'next'`` or ``'previous'``.

        Returns:
            A ``(success, message)`` tuple.
        """
        if not cls._is_active or not cls._application or not cls._presentation:
            return False, "没有活跃的PPT或对象已失效"

        if not cls._is_ready:
            return False, "PPT正在加载中,请稍后"

        try:
            with cls._lock:
                # Without a slide-show window the presentation is not running.
                if (not hasattr(cls._presentation, 'SlideShowWindow')
                        or cls._presentation.SlideShowWindow is None):
                    return False, "幻灯片放映已终止,请重新打开"

                # The key press goes to whichever window has keyboard focus.
                if direction == 'next':
                    pyautogui.press('down')
                    cls._log("info", "模拟下箭头键,切换到下一页")
                    return True, "已模拟下箭头键,切换到下一页"
                elif direction == 'previous':
                    pyautogui.press('up')
                    cls._log("info", "模拟上箭头键,切换到上一页")
                    return True, "已模拟上箭头键,切换到上一页"
                else:
                    return False, "无效的翻页方向"

        except Exception as e:
            cls._log("error", f"翻页失败: {str(e)}")
            return False, f"翻页失败: {str(e)}"

    @classmethod
    def close(cls):
        """Close the currently open presentation."""
        with cls._lock:
            cls._cleanup_resources()
            cls._log("info", "PPT已关闭")

    @classmethod
    def is_active(cls):
        """Return True while a presentation is open."""
        return cls._is_active

    @classmethod
    def is_ready(cls):
        """Return True once the slide show is running and pages can be turned."""
        return cls._is_ready


class WebController:
    """Show a web page full screen in a Selenium-driven browser."""

    _driver = None            # current WebDriver instance
    _lock = threading.Lock()  # serialises open / close
    _logger = None            # logger injected by the host application

    @classmethod
    def set_logger(cls, logger):
        """Set the logger used by this class."""
        cls._logger = logger

    @classmethod
    def _log(cls, level, message):
        """Emit ``message`` at ``level`` if a logger has been configured."""
        if cls._logger:
            getattr(cls._logger, level)(message)

    @classmethod
    def _create_driver(cls, browser_type):
        """Create a kiosk-mode driver for ``browser_type``; None if unsupported.

        Each browser gets its own Options and Service classes; the Chrome
        ones are not interchangeable with Edge or Firefox.
        """
        kind = browser_type.lower()

        if kind in ['chrome', 'googlechrome']:
            chrome_options = Options()
            chrome_options.add_argument("--kiosk")
            chrome_options.add_argument("--disable-infobars")
            chrome_options.add_experimental_option("useAutomationExtension", False)
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            service = Service(executable_path='chromedriver.exe')
            return webdriver.Chrome(service=service, options=chrome_options)

        if kind in ['edge', 'microsoftedge']:
            edge_options = EdgeOptions()
            edge_options.add_argument("--kiosk")
            edge_options.add_argument("--disable-infobars")
            service = EdgeService(executable_path='msedgedriver.exe')
            return webdriver.Edge(service=service, options=edge_options)

        if kind in ['firefox', 'mozilla']:
            firefox_options = webdriver.FirefoxOptions()
            firefox_options.add_argument("--kiosk")
            service = FirefoxService(executable_path='geckodriver.exe')
            return webdriver.Firefox(service=service, options=firefox_options)

        return None

    @classmethod
    def open_fullscreen(cls, url, browser_type='chrome'):
        """Open ``url`` full screen in a browser.

        Any browser previously started by this class is quit first.

        Args:
            url: URL to open.
            browser_type: ``'chrome'``, ``'edge'`` or ``'firefox'``.

        Returns:
            A ``(success, message)`` tuple.
        """
        with cls._lock:
            if cls._driver:
                try:
                    cls._driver.quit()
                except Exception:
                    # Best effort: the old browser may already be gone.
                    pass
                cls._driver = None

            try:
                cls._driver = cls._create_driver(browser_type)
                if cls._driver is None:
                    cls._log("error", f"不支持的浏览器类型: {browser_type}")
                    return False, f"不支持的浏览器类型: {browser_type}"

                cls._driver.get(url)

                try:
                    # Some browsers additionally need F11; release the key
                    # again so it is not left pressed in the input state.
                    ActionChains(cls._driver).key_down(Keys.F11).key_up(Keys.F11).perform()
                    time.sleep(0.5)  # wait for full screen to take effect
                except Exception:
                    # Kiosk mode already covers the screen; F11 is optional.
                    pass

                cls._log("info", f"已在{browser_type}中全屏打开: {url}")
                return True, f"已在{browser_type}中全屏打开"

            except Exception as e:
                cls._log("error", f"打开网页失败: {str(e)}")
                return False, f"打开网页失败: {str(e)}"

    @classmethod
    def close(cls):
        """Quit the browser.

        Returns:
            True when no browser was open or it was quit successfully,
            False when quitting raised.
        """
        with cls._lock:
            if cls._driver:
                try:
                    cls._driver.quit()
                    cls._driver = None
                    cls._log("info", "浏览器已关闭")
                    return True
                except Exception as e:
                    cls._log("error", f"关闭浏览器失败: {str(e)}")
                    return False
            return True


def kill_process_by_name(process_names, logger=None):
    """Kill every running process whose name is in ``process_names``.

    Names are compared case-insensitively, since the callers in this file
    pass Windows image names such as ``POWERPNT.EXE`` and ``chrome.exe``.

    Args:
        process_names: A process name or an iterable of process names.
        logger: Optional logger for progress and failures.
    """
    if isinstance(process_names, str):
        process_names = [process_names]
    wanted = {name.lower() for name in process_names}

    for proc in psutil.process_iter(['name']):
        try:
            # psutil may report None for a name it cannot read.
            if (proc.info['name'] or "").lower() in wanted:
                proc.kill()
                if logger:
                    logger.info(f"已结束进程: {proc.info['name']}")
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess) as e:
            if logger:
                logger.warning(f"无法结束进程: {e}")
        except Exception as e:
            if logger:
                logger.error(f"结束进程时出错: {e}")


def cleanup_processes(logger=None):
    """Kill presentation and browser processes that may have been left behind.

    This ends every matching process on the machine, not only the ones
    started by this module.
    """
    kill_process_by_name([
        "POWERPNT.EXE",  # PowerPoint
        "wpp.exe",       # WPS Presentation
        "et.exe",        # WPS Spreadsheets
        "wps.exe",       # WPS Writer
        "chrome.exe",    # Chrome
        "msedge.exe",    # Edge
        "firefox.exe",   # Firefox
    ], logger=logger)

# Question attached to this snippet (translated): is there a way to control
# PPT without using pycom?

filetype

--------------------------------------------------------------------------- PackageNotFoundError Traceback (most recent call last) Cell In[1], line 30 27 new_prs.save(output_path) 29 # 使用示例 ---> 30 keep_first_slide("原始文件.pptx", "仅第一页.pptx") Cell In[1], line 5, in keep_first_slide(input_path, output_path) 3 def keep_first_slide(input_path, output_path): 4 # 打开原始PPT ----> 5 prs = Presentation(input_path) 7 # 创建新演示文稿 8 new_prs = Presentation() File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\api.py:31, in Presentation(pptx) 28 if pptx is None: 29 pptx = _default_pptx_path() ---> 31 presentation_part = Package.open(pptx).main_document_part 33 if not _is_pptx_package(presentation_part): 34 tmpl = "file '%s' is not a PowerPoint file, content type is '%s'" File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\package.py:82, in OpcPackage.open(cls, pkg_file) 79 @classmethod 80 def open(cls, pkg_file: str | IO[bytes]) -> Self: 81 """Return an |OpcPackage| instance loaded with the contents of `pkg_file`.""" ---> 82 return cls(pkg_file)._load() File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\package.py:160, in OpcPackage._load(self) 158 def _load(self) -> Self: 159 """Return the package after loading all parts and relationships.""" --> 160 pkg_xml_rels, parts = _PackageLoader.load(self._pkg_file, cast("Package", self)) 161 self._rels.load_from_xml(PACKAGE_URI, pkg_xml_rels, parts) 162 return self File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\package.py:190, in _PackageLoader.load(cls, pkg_file, package) 177 @classmethod 178 def load( 179 cls, pkg_file: str | IO[bytes], package: Package 180 ) -> tuple[CT_Relationships, dict[PackURI, Part]]: 181 """Return (pkg_xml_rels, parts) pair resulting from loading `pkg_file`. 182 183 The returned `parts` value is a {partname: part} mapping with each part in the package (...) 188 those relationships into its |_Relationships| object. 
189 """ --> 190 return cls(pkg_file, package)._load() File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\package.py:194, in _PackageLoader._load(self) 192 def _load(self) -> tuple[CT_Relationships, dict[PackURI, Part]]: 193 """Return (pkg_xml_rels, parts) pair resulting from loading pkg_file.""" --> 194 parts, xml_rels = self._parts, self._xml_rels 196 for partname, part in parts.items(): 197 part.load_rels_from_xml(xml_rels[partname], parts) File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\util.py:191, in lazyproperty.__get__(self, obj, type) 186 value = obj.__dict__.get(self._name) 187 if value is None: 188 # --- on first access, the __dict__ item will be absent. Evaluate fget() 189 # --- and store that value in the (otherwise unused) host-object 190 # --- __dict__ value of same name ('fget' nominally) --> 191 value = self._fget(obj) 192 obj.__dict__[self._name] = value 193 return cast(_T, value) File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\package.py:222, in _PackageLoader._parts(self) 214 @lazyproperty 215 def _parts(self) -> dict[PackURI, Part]: 216 """dict {partname: Part} populated with parts loading from package. 217 218 Among other duties, this collection is passed to each relationships collection so each 219 relationship can resolve a reference to its target part when required. This reference can 220 only be reliably carried out once the all parts have been loaded. 221 """ --> 222 content_types = self._content_types 223 package = self._package 224 package_reader = self._package_reader File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\util.py:191, in lazyproperty.__get__(self, obj, type) 186 value = obj.__dict__.get(self._name) 187 if value is None: 188 # --- on first access, the __dict__ item will be absent. 
Evaluate fget() 189 # --- and store that value in the (otherwise unused) host-object 190 # --- __dict__ value of same name ('fget' nominally) --> 191 value = self._fget(obj) 192 obj.__dict__[self._name] = value 193 return cast(_T, value) File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\package.py:207, in _PackageLoader._content_types(self) 201 @lazyproperty 202 def _content_types(self) -> _ContentTypeMap: 203 """|_ContentTypeMap| object providing content-types for items of this package. 204 205 Provides a content-type (MIME-type) for any given partname. 206 """ --> 207 return _ContentTypeMap.from_xml(self._package_reader[CONTENT_TYPES_URI]) File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\serialized.py:38, in PackageReader.__getitem__(self, pack_uri) 36 def __getitem__(self, pack_uri: PackURI) -> bytes: 37 """Return bytes for part corresponding to `pack_uri`.""" ---> 38 return self._blob_reader[pack_uri] File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\util.py:191, in lazyproperty.__get__(self, obj, type) 186 value = obj.__dict__.get(self._name) 187 if value is None: 188 # --- on first access, the __dict__ item will be absent. 
Evaluate fget() 189 # --- and store that value in the (otherwise unused) host-object 190 # --- __dict__ value of same name ('fget' nominally) --> 191 value = self._fget(obj) 192 obj.__dict__[self._name] = value 193 return cast(_T, value) File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\serialized.py:52, in PackageReader._blob_reader(self) 49 @lazyproperty 50 def _blob_reader(self) -> _PhysPkgReader: 51 """|_PhysPkgReader| subtype providing read access to the package file.""" ---> 52 return _PhysPkgReader.factory(self._pkg_file) File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\pptx\opc\serialized.py:144, in _PhysPkgReader.factory(cls, pkg_file) 141 if zipfile.is_zipfile(pkg_file): 142 return _ZipPkgReader(pkg_file) --> 144 raise PackageNotFoundError("Package not found at '%s'" % pkg_file) PackageNotFoundError: Package not found at '原始文件.pptx'

filetype

from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor


def _add_text(slide, left, top, width, height, text, size, bold=False, align=None):
    """Add a one-paragraph text box; geometry is given in inches, size in points.

    ``bold`` and ``align`` are only written when requested, so paragraphs
    that do not ask for them keep their defaults.
    """
    textbox = slide.shapes.add_textbox(
        Inches(left), Inches(top), Inches(width), Inches(height)
    )
    text_frame = textbox.text_frame
    text_frame.text = text
    first_paragraph = text_frame.paragraphs[0]
    first_paragraph.font.size = Pt(size)
    if bold:
        first_paragraph.font.bold = True
    if align is not None:
        first_paragraph.alignment = align
    return textbox


# Create the deck with 16:9 slide dimensions
prs = Presentation()
prs.slide_width = Inches(13.33)
prs.slide_height = Inches(7.5)

# Title slide (layout 6 of the default template)
cover = prs.slides.add_slide(prs.slide_layouts[6])
_add_text(cover, 1, 2.5, 11, 1, "骨折影像诊断与报告规范", 44,
          bold=True, align=PP_ALIGN.CENTER)
_add_text(cover, 3, 3.5, 7, 1, "2025年版", 28, align=PP_ALIGN.CENTER)

# Table-of-contents slide
toc = prs.slides.add_slide(prs.slide_layouts[6])
_add_text(toc, 1, 0.5, 11, 0.8, "目录", 36, bold=True)

content = [
    "1. 骨折的基本概念与分类",
    "2. 常用影像检查方法",
    "3. 常见骨折类型与典型征象",
    "4. 骨折愈合过程",
    "5. 特殊骨折与易漏诊部位",
    "6. 国内外指南要点",
    "7. 报告书写规范与模板",
    "8. 病例讨论",
    "9. 考核与答疑",
]
# One entry per row, 0.6 inch apart, starting 1.5 inch from the top
for row, entry in enumerate(content):
    _add_text(toc, 1.5, 1.5 + row * 0.6, 10, 0.5, entry, 24)

# Further slides can be added the same way...

# Save the PPTX
prs.save("骨折影像诊断与报告规范.pptx")

资源评论
用户头像
蟹蛛
2025.06.07
该演示文件详细介绍了数据仓库的入仓表分析规则以及上游系统的现状调研。🍘
用户头像
田仲政
2025.05.05
贴源层的上游调研和分析,为数据仓库的高效入仓奠定了基础。
用户头像
章满莫
2025.03.29
对于从事数据仓库工作的专业人士来说,这是份不可多得的参考资料。
tot286969
  • 粉丝: 0
上传资源 快速赚钱