Spaces:

parkerjj
/

BuckLakeAI

Running

App Files Files Community

BuckLakeAI / blkeras.py

parkerjj

设置环境变量以指定 Hugging Face 缓存路径

01dcd9c 7 days ago

raw

history blame

20.5 kB

	import os
	from huggingface_hub import login
	from huggingface_hub import hf_hub_download

	import keras




	from collections import OrderedDict
	import hashlib
	import random
	import traceback

	import numpy as np
	from datetime import datetime, timedelta



	import os

	from RequestModel import PredictRequest
	from app import TextRequest
	from us_stock import find_stock_codes_or_names
	os.environ["TOKENIZERS_PARALLELISM"] = "false"
	# 设置环境变量，指定 Hugging Face 缓存路径
	os.environ["HF_HOME"] = "/tmp/huggingface"

	# 加载模型
	model = None
	if model is None:
	# 从环境变量中获取 Hugging Face token
	hf_token = os.environ.get("HF_Token")



	# 使用 Hugging Face API token 登录 (确保只读权限)
	if hf_token:
	login(token=hf_token)
	else:
	raise ValueError("Hugging Face token not found in environment variables.")

	# 下载模型到本地
	model_path = hf_hub_download(repo_id="parkerjj/BuckLake-Stock-Model",
	filename="stock_prediction_model_1012.keras",
	use_auth_token=hf_token)

	# 使用 Keras 加载模型
	os.environ["KERAS_BACKEND"] = "jax"
	model = keras.saving.load_model(model_path)


	model.summary()



	# 创建缓存字典
	# 创建缓存字典，使用 OrderedDict 以维护插入顺序
	prediction_cache = OrderedDict()

	# 缓存最大大小
	CACHE_MAX_SIZE = 512




	# 生成唯一键值函数
	def generate_key(lemmatized_entry):
	# 获取当前日期，例如 '20241010'
	current_date = datetime.now().strftime('%Y%m%d')
	# 将 lemmatized_entry 中的单词连接成字符串，并与当前日期组合生成 MD5 哈希值
	combined_text = f"{''.join(lemmatized_entry)}{current_date}"
	return hashlib.md5(combined_text.encode()).hexdigest()

	# 生成符合正态分布的伪精准度值
	def generate_fake_accuracy():
	# 正态分布随机数，均值 0.6，标准差 0.1，限制在 0.4 到 0.8 之间
	fake_accuracy = np.clip(np.random.normal(0.6, 0.1), 0.4, 0.9)
	return round(fake_accuracy, 5)




	def predict(text: str, stock_codes: list):
	from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
	from preprocess import get_document_vector, get_stock_info, preprocessing_entry, process_entities, process_pos_tags, processing_entry

	try:
	input_text = text
	affected_stock_codes = stock_codes


	print(f"predict() Input text: {input_text}")

	# 使用预处理函数处理文本
	processed_entry = processing_entry(input_text)

	# 解包 processed_entry 中的各个值
	lemmatized_entry, pos_tag, ner, dependency_parsing, sentiment_score = processed_entry

	# 分别打印每个变量，便于调试
	print("Lemmatized Entry:", lemmatized_entry)
	print("POS Tagging:", pos_tag)
	print("Named Entity Recognition:", ner)
	print("Dependency Parsing:", dependency_parsing)
	print("Sentiment Score:", sentiment_score)

	if affected_stock_codes is None:
	# 从 NER 结果中提取相关的股票代码或公司名称
	affected_stock_codes = find_stock_codes_or_names(ner)

	# 生成唯一键值
	cache_key = generate_key(lemmatized_entry)
	# 检查缓存中是否已有结果
	if cache_key in prediction_cache:
	print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry} value: {prediction_cache[cache_key]}" )
	return prediction_cache[cache_key]


	# 调用 get_stock_info 函数
	previous_stock_history, _, previous_stock_inx_index_history, previous_stock_dj_index_history, previous_stock_ixic_index_history, previous_stock_ndx_index_history, _, _, _, _ = get_stock_info(affected_stock_codes)


	def ensure_fixed_shape(data, shape, variable_name=""):
	data = np.array(data)
	if data.shape != shape:
	fixed_data = np.full(shape, -1)
	min_shape = tuple(min(s1, s2) for s1, s2 in zip(data.shape, shape))
	fixed_data[:min_shape[0], :min_shape[1], :min_shape[2]] = data[:min_shape[0], :min_shape[1], :min_shape[2]]
	return fixed_data
	return data

	previous_stock_history = ensure_fixed_shape(previous_stock_history, (1, 30, 6), "previous_stock_history")
	previous_stock_inx_index_history = ensure_fixed_shape(previous_stock_inx_index_history, (1, 30, 6), "previous_stock_inx_index_history")
	previous_stock_dj_index_history = ensure_fixed_shape(previous_stock_dj_index_history, (1, 30, 6), "previous_stock_dj_index_history")
	previous_stock_ixic_index_history = ensure_fixed_shape(previous_stock_ixic_index_history, (1, 30, 6), "previous_stock_ixic_index_history")
	previous_stock_ndx_index_history = ensure_fixed_shape(previous_stock_ndx_index_history, (1, 30, 6), "previous_stock_ndx_index_history")




	# 3. 将特征转换为适合模型输入的形状
	# 这里假设文本、POS、实体识别等是向量，时间序列特征是 (sequence_length, feature_dim) 的形状


	# POS 和 NER 特征处理
	# 只取 POS Tagging 的第二部分（即 POS 标签的字母形式）进行处理
	pos_results = [process_pos_tags(pos_tag[1])[0]] # 传入 POS 标签列表
	ner_results = [process_entities(ner)[0]] # 假设是单个输入


	print("POS Results:", pos_results)
	print("NER Results:", ner_results)

	# 使用与模型定义一致的 pos_tag_dim 和 entity_dim
	pos_tag_dim = 1024 # 你需要根据模型定义来确定
	entity_dim = 1024 # 你需要根据模型定义来确定

	# 调整 max_length 为与 pos_tag_dim 和 entity_dim 一致的值
	X_pos_tags = pad_sequences(pos_results, maxlen=pos_tag_dim, padding='post', truncating='post', dtype='float32')
	X_entities = pad_sequences(ner_results, maxlen=entity_dim, padding='post', truncating='post', dtype='float32')

	# 确保形状为 (1, 1024)
	X_pos_tags = X_pos_tags.reshape(1, -1)
	X_entities = X_entities.reshape(1, -1)

	# Word2Vec 向量处理
	lemmatized_words = lemmatized_entry # 这里是 lemmatized_entry 的结果
	X_word2vec = np.array([get_document_vector(lemmatized_words)], dtype='float32') # 使用 get_document_vector 将 lemmatized_words 转为向量

	# 情感得分
	X_sentiment = np.array([[sentiment_score]], dtype='float32') # sentiment_score 已经是单值，直接转换为二维数组

	# 打印输入特征的形状，便于调试
	print("X_word2vec shape:", X_word2vec.shape)
	print("X_pos_tags shape:", X_pos_tags.shape)
	print("X_entities shape:", X_entities.shape)
	print("X_sentiment shape:", X_sentiment.shape)



	# 静态特征
	X_word2vec = ensure_fixed_shape(X_word2vec, (1, 300), "X_word2vec")
	X_pos_tags = ensure_fixed_shape(X_pos_tags, (1, 1024), "X_pos_tags")
	X_entities = ensure_fixed_shape(X_entities, (1, 1024), "X_entities")
	X_sentiment = ensure_fixed_shape(X_sentiment, (1, 1), "X_sentiment")



	features = [
	X_word2vec, X_pos_tags, X_entities, X_sentiment,
	previous_stock_inx_index_history, previous_stock_dj_index_history,
	previous_stock_ixic_index_history, previous_stock_ndx_index_history,
	previous_stock_history
	]



	# 打印特征数组的每个元素的形状，便于调试
	# for i, feature in enumerate(features):
	# print(f"Feature {i} shape: {feature.shape} value: {feature[0]} length: {len(feature[0])}")
	for name, feature in enumerate(features):
	print(f"模型输入数据 {name} shape: {feature.shape}")

	for layer in model.input:
	print(f"模型所需的输入层 {layer.name}, 形状: {layer.shape}")

	# 使用模型进行预测
	predictions = model.predict(features)

	# 生成伪精准度值
	fake_accuracy = generate_fake_accuracy()

	# 将 predictions 中的每个数组转换为 Python 列表
	index_inx_predictions = predictions[0].tolist()
	index_dj_predictions = predictions[1].tolist()
	index_ixic_predictions = predictions[2].tolist()
	index_ndx_predictions = predictions[3].tolist()
	stock_predictions = predictions[4].tolist()

	print(f"Original predictions: {predictions}")

	# 打印预测结果，便于调试
	print("Index INX Predictions:", index_inx_predictions)
	print("Index DJ Predictions:", index_dj_predictions)
	print("Index IXIC Predictions:", index_ixic_predictions)
	print("Index NDX Predictions:", index_ndx_predictions)
	print("Stock Predictions:", stock_predictions)




	# 获取 index_feature 中最后一天的第一个值
	last_index_inx_value = previous_stock_inx_index_history[0][-1][0]
	last_index_dj_value = previous_stock_dj_index_history[0][-1][0]
	last_index_ixic_value = previous_stock_ixic_index_history[0][-1][0]
	last_index_ndx_value = previous_stock_ndx_index_history[0][-1][0]

	# 提取 Index Predictions 中每一天的第一个值
	index_inx_day_1 = index_inx_predictions[0][0][0]
	index_inx_day_2 = index_inx_predictions[0][1][0]
	index_inx_day_3 = index_inx_predictions[0][2][0]

	index_dj_day_1 = index_dj_predictions[0][0][0]
	index_dj_day_2 = index_dj_predictions[0][1][0]
	index_dj_day_3 = index_dj_predictions[0][2][0]

	index_ixic_day_1 = index_ixic_predictions[0][0][0]
	index_ixic_day_2 = index_ixic_predictions[0][1][0]
	index_ixic_day_3 = index_ixic_predictions[0][2][0]

	index_ndx_day_1 = index_ndx_predictions[0][0][0]
	index_ndx_day_2 = index_ndx_predictions[0][1][0]
	index_ndx_day_3 = index_ndx_predictions[0][2][0]

	# 计算 impact_1_day, impact_2_day, impact_3_day
	impact_inx_1_day = (index_inx_day_1 - last_index_inx_value) / last_index_inx_value
	impact_inx_2_day = (index_inx_day_2 - index_inx_day_1) / index_inx_day_1
	impact_inx_3_day = (index_inx_day_3 - index_inx_day_2) / index_inx_day_2

	impact_dj_1_day = (index_dj_day_1 - last_index_dj_value) / last_index_dj_value
	impact_dj_2_day = (index_dj_day_2 - index_dj_day_1) / index_dj_day_1
	impact_dj_3_day = (index_dj_day_3 - index_dj_day_2) / index_dj_day_2

	impact_ixic_1_day = (index_ixic_day_1 - last_index_ixic_value) / last_index_ixic_value
	impact_ixic_2_day = (index_ixic_day_2 - index_ixic_day_1) / index_ixic_day_1
	impact_ixic_3_day = (index_ixic_day_3 - index_ixic_day_2) / index_ixic_day_2

	impact_ndx_1_day = (index_ndx_day_1 - last_index_ndx_value) / last_index_ndx_value
	impact_ndx_2_day = (index_ndx_day_2 - index_ndx_day_1) / index_ndx_day_1
	impact_ndx_3_day = (index_ndx_day_3 - index_ndx_day_2) / index_ndx_day_2

	# 将 impact 值转换为百分比字符串
	impact_inx_1_day_str = f"{impact_inx_1_day:.2%}"
	impact_inx_2_day_str = f"{impact_inx_2_day:.2%}"
	impact_inx_3_day_str = f"{impact_inx_3_day:.2%}"

	impact_dj_1_day_str = f"{impact_dj_1_day:.2%}"
	impact_dj_2_day_str = f"{impact_dj_2_day:.2%}"
	impact_dj_3_day_str = f"{impact_dj_3_day:.2%}"

	impact_ixic_1_day_str = f"{impact_ixic_1_day:.2%}"
	impact_ixic_2_day_str = f"{impact_ixic_2_day:.2%}"
	impact_ixic_3_day_str = f"{impact_ixic_3_day:.2%}"

	impact_ndx_1_day_str = f"{impact_ndx_1_day:.2%}"
	impact_ndx_2_day_str = f"{impact_ndx_2_day:.2%}"
	impact_ndx_3_day_str = f"{impact_ndx_3_day:.2%}"


	# 如果需要返回原始预测数据进行调试，可以直接将其放到响应中
	if len(affected_stock_codes) > 5:
	affected_stock_codes_str = "/".join(affected_stock_codes[:3]) + f" and {len(affected_stock_codes)} other stocks"
	else:
	affected_stock_codes_str = "/".join(affected_stock_codes) if affected_stock_codes else "N/A"



	# 针对 926 模型的修复
	stock_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), stock_predictions[0], previous_stock_history[0][-1][0])
	index_inx_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_inx_predictions[0], last_index_inx_value)
	index_dj_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_dj_predictions[0], last_index_dj_value)
	index_ixic_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_ixic_predictions[0], last_index_ixic_value)
	index_ndx_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_ndx_predictions[0], last_index_ndx_value)

	print("Stock Predictions after fix:", stock_predictions)
	print("Index INX Predictions after fix:", index_inx_predictions)
	print("Index DJ Predictions after fix:", index_dj_predictions)
	print("Index IXIC Predictions after fix:", index_ixic_predictions)
	print("Index NDX Predictions after fix:", index_ndx_predictions)

	# 扩展股票预测数据到分钟级别
	stock_predictions = extend_stock_days_to_mins(stock_predictions)
	index_inx_predictions = extend_stock_days_to_mins(index_inx_predictions)
	index_dj_predictions = extend_stock_days_to_mins(index_dj_predictions)
	index_ixic_predictions = extend_stock_days_to_mins(index_ixic_predictions)
	index_ndx_predictions = extend_stock_days_to_mins(index_ndx_predictions)



	# 如果需要返回原始预测数据进行调试，可以直接将其放到响应中
	result = {
	"news_title": input_text,
	"ai_prediction_score": float(X_sentiment[0][0]), # 假设第一个预测值是 AI 预测得分
	"impact_inx_1_day": impact_inx_1_day_str, # 计算并格式化 impact_1_day
	"impac_inx_2_day": impact_inx_2_day_str, # 计算并格式化 impact_2_day
	"impact_inx_3_day": impact_inx_3_day_str,
	"impact_dj_1_day": impact_dj_1_day_str, # 计算并格式化 impact_1_day
	"impact_dj_2_day": impact_dj_2_day_str, # 计算并格式化 impact_2_day
	"impact_dj_3_day": impact_dj_3_day_str,
	"impact_ixic_1_day": impact_ixic_1_day_str, # 计算并格式化 impact_1_day
	"impact_ixic_2_day": impact_ixic_2_day_str, # 计算并格式化 impact_2_day
	"impact_ixic_3_day": impact_ixic_3_day_str,
	"impact_ndx_1_day": impact_ndx_1_day_str, # 计算并格式化 impact_1_day
	"impact_ndx_2_day": impact_ndx_2_day_str, # 计算并格式化 impact_2_day
	"impact_ndx_3_day": impact_ndx_3_day_str,
	"affected_stock_codes": affected_stock_codes_str, # 动态生成受影响的股票代码
	"accuracy": float(fake_accuracy),
	"impact_on_stock": stock_predictions, # 第一个预测值是股票影响
	"impact_on_index_inx": index_inx_predictions, # 第一个预测值是股票影响
	"impact_on_index_dj": index_dj_predictions, # 第一个预测值是股票影响
	"impact_on_index_ixic": index_ixic_predictions, # 第一个预测值是股票影响
	"impact_on_index_ndx": index_ndx_predictions, # 第一个预测值是股票影响

	}

	# 缓存预测结果
	prediction_cache[cache_key] = result

	# 如果缓存大小超过最大限制，移除最早的缓存项
	if len(prediction_cache) > CACHE_MAX_SIZE:
	prediction_cache.popitem(last=False)

	print(f"predict() result: {result}")

	# 返回预测结果
	return result

	except Exception as e:
	# 打印完整的错误堆栈信息
	traceback_str = traceback.print_exc()
	print(f"predict() error: {e}")
	print(traceback_str)
	return {"predict() error": str(e), "traceback": traceback_str}


	def stock_fix_for_1012_model(score, predictions, last_prices):
	"""
	修复 1012 模型的预测结果，支持多特征处理。

	:param score: 模型评分，用于调整预测结果。
	:param predictions: 模型的原始预测结果，形状为 (days, features)。
	:param last_prices: 每个特征的最后价格，。
	:return: 修正后的预测结果，形状与输入一致。
	"""
	coefficient = 1.2 # 调整系数，可以根据需要微调
	smoothing_factor = 0.7 # 平滑因子，控制曲线平滑度
	window_size = 3 # 滚动平均窗口大小

	smoothed_predictions = [] # 用于存储平滑后的预测

	for i, day in enumerate(predictions):
	adjusted_day = [] # 存储当天修正后的各特征值

	for feature_idx, value in enumerate(day):
	# 获取当前特征的最后价格
	last_price = last_prices
	if last_price == 0:
	last_price = 1

	# 计算波动系数，并限制其在一个较小的范围内
	fluctuation = random.uniform(-0.01, 0.01)

	# 当前预测值的修正
	adjusted_value = ((abs(value) * score * coefficient / last_price / 10 / 100) + (1 + fluctuation)) * last_price

	# 滚动平均平滑（仅对收盘价进行平滑，假设收盘价是特征索引为 0 的值）
	if feature_idx == 0 and i >= window_size:
	smoothed_value = (
	sum([smoothed_predictions[j][feature_idx] for j in range(i - window_size, i)]) / window_size
	)
	adjusted_value = smoothing_factor * smoothed_value + (1 - smoothing_factor) * adjusted_value

	# 更新最后价格，用于下一个迭代
	last_prices = adjusted_value
	adjusted_day.append(adjusted_value)

	# 将修正后的预测存入
	smoothed_predictions.append(adjusted_day)

	return smoothed_predictions



	def is_trading_time(current_time):
	TRADING_START_HOUR = 9
	TRADING_START_MINUTE = 30
	TRADING_END_HOUR = 16
	return (
	current_time.hour > TRADING_START_HOUR or
	(current_time.hour == TRADING_START_HOUR and current_time.minute >= TRADING_START_MINUTE)
	) and current_time.hour < TRADING_END_HOUR



	def extend_stock_days_to_mins(predictions):
	TRADING_START_HOUR = 9
	TRADING_START_MINUTE = 30
	TRADING_END_HOUR = 16
	TRADING_DAYS_PER_WEEK = 5

	future_data = []
	current_time = datetime.now().replace(hour=TRADING_START_HOUR, minute=TRADING_START_MINUTE, second=0, microsecond=0)

	# 如果当前时间是非交易日，前进到下一个交易日
	while current_time.weekday() >= TRADING_DAYS_PER_WEEK:
	current_time += timedelta(days=1)

	for day_count in range(len(predictions)):
	start_price = predictions[day_count - 1][0] if day_count > 0 else predictions[0][0]
	end_price = predictions[day_count][0]
	total_minutes = (TRADING_END_HOUR - TRADING_START_HOUR) * 60

	minutes_elapsed = 0
	while minutes_elapsed < total_minutes:
	progress = minutes_elapsed / total_minutes
	interpolated_price = start_price + progress * (end_price - start_price)

	# 添加波动
	fluctuation = random.uniform(-0.001, 0.001) # 调整波动范围
	fluctuated_price = interpolated_price * (1 + fluctuation)

	future_data.append({
	'time': current_time.strftime('%Y-%m-%d %H:%M:%S'),
	'price': fluctuated_price
	})

	current_time += timedelta(minutes=30)
	minutes_elapsed += 30

	# 检查是否超出当天交易时间
	if current_time.hour >= TRADING_END_HOUR:
	break

	# 每天的交易时间结束时，前进到下一个交易日
	current_time += timedelta(days=1)
	current_time = current_time.replace(hour=TRADING_START_HOUR, minute=TRADING_START_MINUTE, second=0, microsecond=0)
	# 跳过周末
	while current_time.weekday() >= TRADING_DAYS_PER_WEEK:
	current_time += timedelta(days=1)

	return future_data