
Java Creator James Gosling: The Development and Applications of the Java Language

This resource is an introductory Java tutorial covering the Java language, created by James Gosling, and its history. Gosling was not only a vice president and Fellow at Sun Microsystems but also the hands-on builder of the language: he wrote the Java compiler and the Java virtual machine, and helped drive Java to become one of the world's most popular development languages.

The course covers the essentials of Java development:

1. **Instructor introduction**: not detailed here, but likely the instructor's professional background and teaching style.
2. **Course overview**: a guided entry into Java programming, from installing the development environment to basic concepts such as identifiers, keywords, and data types.
3. **Expressions and flow control**: writing simple logic, such as conditional branches (if-else statements); see the first sketch below.
4. **Arrays**: how arrays are used in Java as a core tool for storing and manipulating data (also covered in the first sketch below).
5. **Common data structures and algorithms**: basic structures (arrays, linked lists, stacks, and queues) and their role in algorithm design; see the second sketch below.
6. **Object-oriented programming (OOP)**: the core concepts of encapsulation, inheritance, and polymorphism; see the third sketch below.
7. **Exception handling**: understanding and using Java's exception mechanism to make code more robust; see the fourth sketch below.
8. **History of programming languages**: machine language, assembly language, and high-level languages compared, highlighting Java's readability and portability as a high-level language.
9. **Origins of Java**: the background to Java's creation, including the role of the Stanford University Network and how the language got its name.
10. **History of Java releases**: the key versions from 1995 through 2006, tracing the language's technical evolution and shifting market position.

The course goals are explicit: learners should master the basics of program design, including the programming workflow, compiling and running Java applications, using the Java documentation, understanding the language's syntax and structure, applying object-oriented programming, and handling exceptions.

By the end, learners should understand Java's history, characteristics, and application areas, and be able to build cross-platform applications. Understanding how the language evolved also helps in grasping its design philosophy and technical trajectory.
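To make the flow-control and array topics concrete, here is a minimal, self-contained Java sketch; the class name `FlowAndArrays` and all values are invented for illustration and do not come from the course materials. With a standard JDK it compiles with `javac FlowAndArrays.java` and runs with `java FlowAndArrays`, which is also the compile-and-run workflow the course goals mention.

```java
// FlowAndArrays.java -- hypothetical example, not from the course materials.
public class FlowAndArrays {
    public static void main(String[] args) {
        // Flow control: a simple if-else branch
        int score = 72;
        if (score >= 60) {
            System.out.println("pass");
        } else {
            System.out.println("fail");
        }

        // Arrays: fixed-length, zero-indexed storage
        int[] values = {3, 1, 4, 1, 5};
        int sum = 0;
        for (int v : values) {   // enhanced for loop over the array
            sum += v;
        }
        System.out.println("sum = " + sum);  // prints: sum = 14
    }
}
```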
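The data-structures topic lists stacks among the basic structures. Below is a rough sketch of an array-backed integer stack; the `IntStack` class is hypothetical, not from the course, and in practice `java.util.ArrayDeque` provides this behavior out of the box.

```java
// IntStack.java -- hypothetical array-backed stack, for illustration only.
public class IntStack {
    private int[] data = new int[16];
    private int size = 0;

    public void push(int value) {
        if (size == data.length) {                 // grow when full
            int[] bigger = new int[data.length * 2];
            System.arraycopy(data, 0, bigger, 0, size);
            data = bigger;
        }
        data[size++] = value;
    }

    public int pop() {
        if (size == 0) {
            throw new IllegalStateException("stack is empty");
        }
        return data[--size];                       // last in, first out
    }

    public static void main(String[] args) {
        IntStack s = new IntStack();
        s.push(1);
        s.push(2);
        System.out.println(s.pop());  // 2
        System.out.println(s.pop());  // 1
    }
}
```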
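For the OOP topic, here is a small sketch of encapsulation, inheritance, and polymorphism. The `Shape`/`Circle`/`Square` hierarchy is a stock teaching example invented here, not taken from the course.

```java
// Shapes.java -- hypothetical OOP example: encapsulation, inheritance, polymorphism.
abstract class Shape {
    private final String name;          // encapsulated field, exposed via a getter

    Shape(String name) { this.name = name; }

    String getName() { return name; }

    abstract double area();             // each subclass supplies its own area
}

class Circle extends Shape {            // inheritance
    private final double radius;

    Circle(double radius) {
        super("circle");
        this.radius = radius;
    }

    @Override
    double area() { return Math.PI * radius * radius; }
}

class Square extends Shape {
    private final double side;

    Square(double side) {
        super("square");
        this.side = side;
    }

    @Override
    double area() { return side * side; }
}

public class Shapes {
    public static void main(String[] args) {
        Shape[] shapes = { new Circle(1.0), new Square(2.0) };
        for (Shape s : shapes) {        // polymorphism: the right area() runs at runtime
            System.out.printf("%s area = %.2f%n", s.getName(), s.area());
        }
    }
}
```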
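For exception handling, a minimal try/catch/finally sketch (the class name `ExceptionDemo` is our own). It deliberately triggers an `ArrayIndexOutOfBoundsException` to show how a handler keeps the program from crashing, which is the robustness point the course topic makes.

```java
// ExceptionDemo.java -- hypothetical try/catch/finally example.
public class ExceptionDemo {
    public static void main(String[] args) {
        try {
            int[] values = new int[3];
            System.out.println(values[5]);           // out-of-bounds access throws here
        } catch (ArrayIndexOutOfBoundsException e) { // handle the specific error
            System.out.println("caught: " + e.getMessage());
        } finally {
            System.out.println("cleanup always runs"); // runs with or without an exception
        }
    }
}
```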

Related


```python
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate
from datasets import load_dataset
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import torch
import os
import json
import logging
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

# Configure verbose logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Model path
model_dir = 'C:/Users/admin/.cache/modelscope/hub/tiansz/bert-base-chinese'

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)
logger.info("Tokenizer loaded")

# Load dataset
path = r'C:\Users\admin\八斗学院学习笔记\练习\week11_codes\data'
dataset = load_dataset(path)
logger.info(f"Dataset loaded: {dataset}")

# Collect all entity labels
labels = set()
for row in dataset['train']:
    for ent in row['entities']:
        labels.add(ent['label'])

# Build the full tag list (with B- and I- prefixes)
entities = sorted(list(labels))  # sort for a deterministic order
tags = ['O']
for ent in entities:
    tags.append('B-' + ent.upper())
    tags.append('I-' + ent.upper())

# Tag <-> ID mappings
tag2id = {tag: i for i, tag in enumerate(tags)}
id2tag = {i: tag for i, tag in enumerate(tags)}
logger.info(f"Number of tags: {len(tags)}")
logger.info(f"Tag list: {tags}")

# Persist the tag mappings
os.makedirs('label_mappings', exist_ok=True)
with open('label_mappings/tag2id.json', 'w', encoding='utf-8') as f:
    json.dump(tag2id, f, ensure_ascii=False, indent=2)
with open('label_mappings/id2tag.json', 'w', encoding='utf-8') as f:
    json.dump(id2tag, f, ensure_ascii=False, indent=2)
logger.info("Tag mappings saved to the label_mappings directory")

# Entity-to-tag conversion (fixed version) -- ensures tags are well formed
def entities_proc(items):
    text = items['text']
    items_len = len(text)
    ent_tags = ['O'] * items_len  # initialise everything to 'O'
    entities = items['entities']
    for ent in entities:
        start = ent['start_offset']
        end = ent['end_offset']
        label = ent['label'].upper()  # normalise to upper case
        # Make sure the span starts inside the text
        if start < items_len:
            # B- tag on the first character
            ent_tags[start] = f'B-{label}'
            # I- tags on the remaining characters (if any)
            for pos in range(start + 1, min(end, items_len)):
                ent_tags[pos] = f'I-{label}'
    # Convert to IDs (unknown tags fall back to 'O')
    tag_ids = [tag2id.get(tag, tag2id['O']) for tag in ent_tags]
    return {'ent_tags': tag_ids}

# Apply entity processing
logger.info("Processing entity tags...")
ds = dataset.map(entities_proc)

# Analyse the tag distribution
def analyze_label_distribution(dataset, split):
    logger.info(f"\nAnalysing tag distribution for {split}...")
    all_labels = []
    for example in dataset[split]:
        all_labels.extend([id2tag.get(tag_id, 'O') for tag_id in example['ent_tags']])
    label_counts = Counter(all_labels)
    total_labels = len(all_labels)
    logger.info(f"Total tags: {total_labels}")
    for label, count in label_counts.items():
        percentage = (count / total_labels) * 100
        logger.info(f"{label}: {count} ({percentage:.2f}%)")
    # Fraction of entity (non-O) tags
    entity_labels = [label for label in label_counts.keys() if label != 'O']
    entity_count = sum([label_counts[label] for label in entity_labels])
    entity_percentage = (entity_count / total_labels) * 100
    logger.info(f"Entity tags: {entity_count} ({entity_percentage:.2f}%)")
    logger.info(f"Non-entity tags (O): {label_counts['O']} ({100 - entity_percentage:.2f}%)")
    # Visualise the distribution
    plt.figure(figsize=(12, 6))
    labels_list = list(label_counts.keys())
    counts = [label_counts[l] for l in labels_list]
    plt.bar(labels_list, counts)
    plt.xticks(rotation=90)
    plt.title(f'{split} tag distribution')
    plt.ylabel('count')
    plt.tight_layout()
    plt.savefig(f'label_distribution_{split}.png')
    logger.info(f"Tag distribution plot saved as label_distribution_{split}.png")
    return label_counts

# Analyse both splits
train_label_counts = analyze_label_distribution(ds, 'train')
test_label_counts = analyze_label_distribution(ds, 'test')

# Compute class weights to counter label imbalance
logger.info("\nComputing class weights to counter label imbalance...")
all_train_labels = []
for example in ds['train']:
    all_train_labels.extend(example['ent_tags'])

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(all_train_labels),
    y=all_train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Boost the weights of entity tags
for i, tag in enumerate(tags):
    if tag != 'O':
        class_weights[i] *= 5.0  # up-weight entity tags
logger.info(f"Class weights: {class_weights}")

# Input preprocessing (fixed version -- uses offset_mapping)
def data_input_proc(items):
    texts = items['text']
    # Tokenise with offset mapping enabled
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_offsets_mapping=True,
        is_split_into_words=False,
        return_tensors="pt",
    )
    offset_mappings = tokenized_inputs.pop('offset_mapping')
    all_labels = []
    for i, text in enumerate(texts):
        # Character-level tag IDs for this example
        char_tags = items['ent_tags'][i]
        label_ids = []
        offsets = offset_mappings[i]
        for j, offset in enumerate(offsets):
            start, end = offset
            # Special tokens ([CLS], [SEP], [PAD]) get -100
            if start == 0 and end == 0:
                label_ids.append(-100)
            else:
                # Use the tag of the token's first character
                if start < len(char_tags):
                    label_ids.append(char_tags[start])
                else:
                    # Truncated position
                    label_ids.append(-100)
        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply input preprocessing
logger.info("Preprocessing inputs...")
ds1 = ds.map(
    data_input_proc,
    batched=True,
    batch_size=8,
    remove_columns=ds["train"].column_names
)

# Training arguments
args = TrainingArguments(
    output_dir='ner_train',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=False,  # disabled until the metrics issue is resolved
    report_to='tensorboard',
    logging_dir='ner_train/logs',
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    overwrite_output_dir=True,
    warmup_ratio=0.1,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),
    logging_steps=20,
    remove_unused_columns=False,
    # metric_for_best_model intentionally omitted for now
)

# Wrapper that applies a class-weighted loss
class WeightedLossModel(torch.nn.Module):
    def __init__(self, model, class_weights):
        super().__init__()
        self.model = model
        self.class_weights = class_weights
        self.loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)

    def forward(self, **inputs):
        outputs = self.model(**inputs)
        logits = outputs.logits
        if "labels" in inputs:
            loss = self.loss_fct(logits.view(-1, self.model.config.num_labels),
                                 inputs["labels"].view(-1))
            outputs.loss = loss
        return outputs

# Load the base model
logger.info("Loading base model...")
base_model = AutoModelForTokenClassification.from_pretrained(
    model_dir,
    num_labels=len(tags),
    id2label=id2tag,
    label2id=tag2id,
    ignore_mismatched_sizes=True
).to(device)

# Wrap it with the weighted loss
model = WeightedLossModel(base_model, class_weights).to(device)
logger.info(f"Model: class-weighted {base_model.__class__.__name__}")
logger.info(f"Parameter count: {sum(p.numel() for p in model.parameters()):,}")

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metric computation (debug-heavy version)
def compute_metrics(p):
    try:
        # Accept either an EvalPrediction or a (predictions, labels) tuple
        if hasattr(p, "predictions") and hasattr(p, "label_ids"):
            predictions = p.predictions
            labels = p.label_ids
        elif isinstance(p, tuple) and len(p) == 2:
            predictions, labels = p
        else:
            logger.error(f"Unrecognised input format: {type(p)}")
            return {"f1": 0.0, "precision": 0.0, "recall": 0.0, "accuracy": 0.0}
        # Reduce logits to class IDs if necessary
        if predictions.ndim == 3:
            predictions = np.argmax(predictions, axis=2)
        # Drop ignored positions (-100)
        true_predictions = []
        true_labels = []
        for i in range(len(predictions)):
            preds = []
            lbls = []
            for j in range(len(predictions[i])):
                if labels[i][j] != -100:
                    preds.append(id2tag[predictions[i][j]])
                    lbls.append(id2tag[labels[i][j]])
            true_predictions.append(preds)
            true_labels.append(lbls)
        # Metrics
        precision = 0.0
        recall = 0.0
        f1 = 0.0
        accuracy = 0.0
        if true_labels and any(true_labels):  # make sure the label lists are non-empty
            try:
                precision = precision_score(true_labels, true_predictions, zero_division=0)
                recall = recall_score(true_labels, true_predictions, zero_division=0)
                f1 = f1_score(true_labels, true_predictions, zero_division=0)
                # Token-level accuracy
                total = 0
                correct = 0
                for preds, lbls in zip(true_predictions, true_labels):
                    for p, l in zip(preds, lbls):
                        total += 1
                        if p == l:
                            correct += 1
                accuracy = correct / total if total > 0 else 0.0
                # Classification report
                report = classification_report(true_labels, true_predictions,
                                               output_dict=True, zero_division=0)
                # Per-entity performance
                logger.info("\n===== Detailed classification report =====")
                for tag in tags:
                    if tag in report and tag != 'O':
                        logger.info(f"{tag}: P={report[tag]['precision']:.4f}, "
                                    f"R={report[tag]['recall']:.4f}, "
                                    f"F1={report[tag]['f1-score']:.4f}")
                # Log the first few predictions and labels
                logger.info("\n===== Sample prediction check =====")
                for i in range(min(3, len(true_labels))):
                    logger.info(f"Sample {i} predictions: {' '.join(true_predictions[i][:20])}")
                    logger.info(f"Sample {i} labels: {' '.join(true_labels[i][:20])}")
                    logger.info("")
                # Accuracy on non-O tags
                non_o_correct = 0
                non_o_total = 0
                for preds, lbls in zip(true_predictions, true_labels):
                    for p, l in zip(preds, lbls):
                        if l != 'O':
                            non_o_total += 1
                            if p == l:
                                non_o_correct += 1
                non_o_accuracy = non_o_correct / non_o_total if non_o_total > 0 else 0.0
                logger.info(f"Non-O accuracy: {non_o_accuracy:.4f} ({non_o_correct}/{non_o_total})")
            except Exception as inner_e:
                logger.error(f"Error while computing metrics: {inner_e}")
                logger.error(f"Prediction shape: {predictions.shape}")
                logger.error(f"Label shape: {labels.shape}")
                logger.error(f"len(true_labels): {len(true_labels)}")
                logger.error(f"true_labels sample: {true_labels[:1] if true_labels else 'empty'}")
        else:
            logger.warning("No valid labels during evaluation!")
        return {
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy
        }
    except Exception as outer_e:
        logger.error(f"Fatal error in compute_metrics: {outer_e}")
        return {
            "f1": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "accuracy": 0.0
        }

# Create the Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds1['train'],
    eval_dataset=ds1['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Pre-training evaluation sanity check
logger.info("Running a pre-training evaluation...")
try:
    eval_result = trainer.evaluate()
    logger.info(f"Pre-evaluation result: {eval_result}")
    # Did our metrics come back?
    if "eval_f1" in eval_result:
        logger.info("Metric computation works!")
        # Re-enable best-model tracking
        args.load_best_model_at_end = True
        args.metric_for_best_model = "f1"  # base name; the Trainer adds the "eval_" prefix
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds1['train'],
            eval_dataset=ds1['test'],
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer
        )
    else:
        logger.warning("Metrics were not returned; best-model tracking stays disabled")
except Exception as e:
    logger.error(f"Pre-evaluation failed: {e}")

# Train
logger.info("Starting training...")
try:
    train_result = trainer.train()
    logger.info("Training finished!")
except Exception as e:
    logger.error(f"Training failed: {e}")
    # Retry with a smaller learning rate
    logger.info("Retrying with a smaller learning rate...")
    args.learning_rate = 1e-5
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds1['train'],
        eval_dataset=ds1['test'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )
    train_result = trainer.train()
    logger.info("Training finished!")

# Save the final model
trainer.save_model("final_ner_model")
logger.info("Final model saved to the final_ner_model directory")

# Evaluate after training
logger.info("\n===== Final evaluation =====")
try:
    eval_results = trainer.evaluate(ds1['test'])
    logger.info(f"Validation accuracy: {eval_results.get('eval_accuracy', 0.0):.4f}")
    logger.info(f"Validation precision: {eval_results.get('eval_precision', 0.0):.4f}")
    logger.info(f"Validation recall: {eval_results.get('eval_recall', 0.0):.4f}")
    logger.info(f"Validation F1: {eval_results.get('eval_f1', 0.0):.4f}")
    # Print every available metric
    logger.info("\nAll available metrics:")
    for key, value in eval_results.items():
        logger.info(f"{key}: {value}")
except Exception as e:
    logger.error(f"Final evaluation failed: {e}")

# Plot the training curves
def plot_training_history(history):
    try:
        train_loss = [log['loss'] for log in history if 'loss' in log]
        eval_loss = [log['eval_loss'] for log in history if 'eval_loss' in log]
        if not train_loss:
            logger.warning("No training-loss data to plot")
            return
        epochs = list(range(1, len(train_loss) + 1))
        plt.figure(figsize=(12, 6))
        plt.plot(epochs, train_loss, 'b-', label='train loss')
        if eval_loss:
            plt.plot(epochs[:len(eval_loss)], eval_loss, 'r-', label='eval loss')
        plt.title('Training and validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
        plt.savefig('training_loss.png')
        logger.info("Loss plot saved as training_loss.png")
        # F1 curve (if available)
        eval_f1 = [log['eval_f1'] for log in history if 'eval_f1' in log]
        if eval_f1:
            plt.figure(figsize=(12, 6))
            plt.plot(epochs[:len(eval_f1)], eval_f1, 'g-', label='eval F1')
            plt.title('Validation F1')
            plt.xlabel('Epochs')
            plt.ylabel('F1')
            plt.legend()
            plt.grid(True)
            plt.savefig('eval_f1.png')
            logger.info("F1 plot saved as eval_f1.png")
    except Exception as e:
        logger.error(f"Error while plotting training history: {e}")

plot_training_history(trainer.state.log_history)

# Smoke-test predictions
logger.info("\nTesting model predictions...")
sample_texts = [
    "我在北京大学学习人工智能",
    "马云是阿里巴巴集团的创始人",
    "上海市浦东新区张江高科技园区"
]

# Custom prediction function
def predict_entities(text, model, tokenizer, id2tag, device):
    try:
        # Tokenise
        inputs = tokenizer(
            text,
            return_tensors="pt",
            return_offsets_mapping=True,
            truncation=True,
            max_length=128
        )
        # Move tensors to the device
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        token_type_ids = inputs.get('token_type_ids', None)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)
        offset_mapping = inputs['offset_mapping'].cpu().numpy()[0]
        # Predict
        model.eval()
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()[0]
        # Decode entities
        entities = []
        current_entity = None
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        logger.info(f"\nText: {text}")
        logger.info(f"Tokens: {tokens}")
        for i, (pred, offset) in enumerate(zip(predictions, offset_mapping)):
            # Skip special tokens
            if offset[0] == 0 and offset[1] == 0:
                continue
            tag = id2tag[pred]
            logger.info(f"Token {i}: {tokens[i]} -> {tag} (span: {offset[0]}-{offset[1]})")
            if tag.startswith('B-'):
                # Close the previous entity
                if current_entity:
                    entities.append(current_entity)
                # Open a new one
                entity_type = tag[2:]
                start = offset[0]
                end = offset[1]
                current_entity = {
                    'start': start,
                    'end': end,
                    'type': entity_type,
                    'text': text[start:end]
                }
            elif tag.startswith('I-'):
                # Extend the current entity
                if current_entity and current_entity['type'] == tag[2:]:
                    current_entity['end'] = offset[1]
                    current_entity['text'] = text[current_entity['start']:offset[1]]
                else:
                    # I- without a matching B-: start a new entity anyway
                    entity_type = tag[2:]
                    start = offset[0]
                    end = offset[1]
                    current_entity = {
                        'start': start,
                        'end': end,
                        'type': entity_type,
                        'text': text[start:end]
                    }
            else:  # O
                # Close the previous entity
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        # Append the trailing entity, if any
        if current_entity:
            entities.append(current_entity)
        return entities
    except Exception as e:
        logger.error(f"Error during prediction: {e}")
        return []

# Run the smoke test
for text in sample_texts:
    logger.info(f"\n===== Predicting: {text} =====")
    try:
        entities = predict_entities(text, model, tokenizer, id2tag, device)
        if not entities:
            logger.info("  No entities recognized")
        else:
            for ent in entities:
                logger.info(f"  Entity: {ent['text']}, type: {ent['type']}, span: {ent['start']}-{ent['end']}")
    except Exception as e:
        logger.error(f"Error while predicting '{text}': {e}")

logger.info("\nNER task finished!")
```

What does all of this code actually do, and is there anything that could be optimized?
