QINGCHE commited on
Commit
cbc1d23
1 Parent(s): 1c529f8
__pycache__/classification.cpython-39.pyc CHANGED
Binary files a/__pycache__/classification.cpython-39.pyc and b/__pycache__/classification.cpython-39.pyc differ
 
__pycache__/run.cpython-39.pyc ADDED
Binary file (1.11 kB). View file
 
__pycache__/textInput.cpython-39.pyc ADDED
Binary file (705 Bytes). View file
 
__pycache__/util.cpython-39.pyc CHANGED
Binary files a/__pycache__/util.cpython-39.pyc and b/__pycache__/util.cpython-39.pyc differ
 
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import gradio as gr
import textInput

# Gradio front-end for the text classification / summarization pipeline.
# All real work happens in textInput.text_dump_to_json; this file only
# lays out the UI and wires the button to it.
# (Removed the unused module-level `output` / `keys` lists — dead state.)

with gr.Blocks() as demo:
    # Page header rendered from markdown.
    gr.Markdown("# 文本分类系统")
    gr.Markdown("请选择要输入的文件或填入文本")
    topic_num = gr.Textbox()   # number of topics; Textbox yields a string
    max_length = gr.Textbox()  # max summary length; Textbox yields a string
    with gr.Tabs():
        with gr.Tab("文本输入"):
            text_input = gr.Textbox()
            text_button = gr.Button("生成")

        with gr.Tab("文件输入"):
            gr.Markdown("目前支持的格式有PDF、Word、txt")
            file_input = gr.File()
    # Output tabs: classification result and summary.
    with gr.Tabs():
        with gr.Tab("分类页"):
            text_keys_output = gr.Textbox()

        with gr.Tab("摘要页"):
            text_ab_output = gr.Textbox()
    # with gr.Accordion("Open for More!"):
    #     gr.Markdown("Look at me...")

    # Wire the button: three inputs must match the callback's signature.
    text_button.click(
        textInput.text_dump_to_json,
        inputs=[text_input, topic_num, max_length],
        outputs=[text_keys_output, text_ab_output],
    )
    # image_button.click(flip_image, inputs=image_input, outputs=image_output)

demo.launch()
classification.py CHANGED
@@ -8,7 +8,7 @@ import torch
8
 
9
  def classify_by_topic(articles, central_topics):
10
 
11
- # 计算每篇文章与每个中心主题的相似度,返回一个矩阵
12
  def compute_similarity(articles, central_topics):
13
 
14
  model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
 
8
 
9
  def classify_by_topic(articles, central_topics):
10
 
11
+ # 计算与每个中心主题的相似度,返回一个矩阵
12
  def compute_similarity(articles, central_topics):
13
 
14
  model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
run.py CHANGED
@@ -1,56 +1,47 @@
1
- import util
2
- import abstract
3
- import classification
4
- import inference
5
- import outline
6
- from inference import BertClassificationModel
7
- # input:file/text,topic_num,max_length,output_choice
8
- # output:file/text/topic_sentence
9
-
10
-
11
- # file_process:
12
- # in util
13
- # read file code
14
- # file to json_text
15
-
16
- # convert:
17
- # in util
18
- # convert code
19
- # json_text to text
20
-
21
- # process:
22
- # in util
23
- # text process code
24
- # del stop seg
25
-
26
- text = "我今天的调研内容是大模型训练的关键技术与挑战。在现代机器学习任务中,大模型训练已成为解决复杂问题的重要手段。在本次报告中,我将介绍分布式并行加速、算法模型架构、内存和计算优化以及集群架构等关键技术。首先,分布式并行加速策略,包括数据并行、模型并行、流水线并行和张量并行等四种方式。这些策略帮助我们将训练数据和模型分布到多个设备上,以加速大模型训练过程。接下来,我们将介绍算法模型架构。Transformer网络模型是一种应用广泛的神经网络模型,基于自注意力机制。它在自然语言处理和计算机视觉任务中取得了显著的成果。此外,适用于万亿级稀疏场景的MoE模型,它通过混合专家模型来处理稀疏数据,具有良好的适应性。为了在有限的计算资源下实现大模型训练,我们需要采用内存和计算优化技术。在内存优化方面,我们激活重计算、内存高效优化器和模型压缩等技术。这些技术可以减少内存占用、降低内存消耗,从而提高训练效率。在计算优化方面,混合精度训练、算子融合和梯度累加等技术,以减少计算资源需求,进一步提升训练速度。最后,我们将讨论大模型训练的集群架构。选择合适的集群架构是实现大模型的分布式训练的关键。我们将介绍参数服务器模式(PS)和集合通讯模式(CC)两种流行的集群架构。PS架构通过Server和Worker之间的通信来更新模型参数,而CC模式中每个节点都是工作节点,负责模型训练并掌握当前最新的全局梯度信息。这些集群架构在大模型训练中起到了关键作用,帮助实现分布式训练并提高训练效率。综上所述,大模型训练需要综合考虑分布式并行加速、算法模型架构、内存和计算优化以及集群架构等多个方面。通过合理地优化这些方面,我们可以实现更高效的大模型训练,解决各种规模的机器学习问题。大模型训练的发展为我们提供了更多创新和突破的机会。大数据技术也为实现人工智能的进步和应用做出重要贡献。谢谢大家!"
27
- topic_num = 5
28
- max_length = 50
29
-
30
- article = util.seg(text)
31
- print(article)
32
-
33
- sentences = [util.clean_text(sentence) for sentence in article]
34
-
35
- central_sentences = abstract.abstruct_main(sentences, topic_num)
36
- print(central_sentences)
37
-
38
- groups = classification.classify_by_topic(article, central_sentences)
39
- print(groups)
40
-
41
- groups = util.article_to_group(groups, central_sentences)
42
-
43
- title_dict,title = util.generation(groups, max_length)
44
- # ans:
45
- # {Ai_abstruct:(main_sentence,paragraph)}
46
- for i in title_dict.items():
47
- print(i)
48
-
49
- matrix = inference.inference_matrix(title)
50
- print(matrix)
51
-
52
- text_outline,outline_list = outline.passage_outline(matrix,title)
53
- print(text_outline)
54
-
55
- output = util.formate_text(title_dict,outline_list)
56
- print (output)
 
1
+ import util
2
+ import abstract
3
+ import classification
4
+ import inference
5
+ import outline
6
+ from inference import BertClassificationModel
7
+ # input:file/text,topic_num,max_length,output_choice
8
+ # output:file/text/topic_sentence
9
+
10
+
11
+ # file_process:
12
+ # in util
13
+ # read file code
14
+ # file to json_text
15
+
16
+ # convert:
17
+ # in util
18
+ # convert code
19
+ # json_text to text
20
+
21
+ # process:
22
+ # in util
23
+ # text process code
24
+ # del stop seg
25
+
26
def texClear(article):
    """Return a new list with every sentence of *article* passed through util.clean_text."""
    return [util.clean_text(sentence) for sentence in article]
29
+
30
def textToAb(sentences, article, topic_num, max_length):
    """Run the full pipeline over a pre-segmented article.

    Steps: extract central sentences (abstract), classify the article's
    sentences by topic, group them, generate titles, build the inference
    matrix, derive an outline, and format the final text.

    Args:
        sentences: cleaned sentences (see texClear).
        article: the original segmented sentences.
        topic_num: number of central topics to extract.
        max_length: max length for generated titles/summaries.

    Returns:
        (keys, output): the generated title keys and the formatted text.
    """
    central_sentences = abstract.abstruct_main(sentences, topic_num)
    groups = classification.classify_by_topic(article, central_sentences)
    groups = util.article_to_group(groups, central_sentences)
    # title_dict maps Ai_abstruct -> (main_sentence, paragraph)
    title_dict, title = util.generation(groups, max_length)

    matrix = inference.inference_matrix(title)

    _, outline_list = outline.passage_outline(matrix, title)

    output = util.formate_text(title_dict, outline_list)
    # Iterating a dict yields its keys — replaces the manual append loop.
    keys = list(title)

    return keys, output
 
 
 
 
 
 
 
 
 
textInput.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import run
2
+
3
def text_dump_to_json(text, topic_num=5, max_length=50):
    """Split raw *text* into non-empty lines and run the pipeline on them.

    BUG FIX: app.py wires this callback with three inputs
    (text, topic_num, max_length), but the old signature accepted only
    `text`, so every click raised TypeError. The new parameters default to
    the previously hard-coded 5 / 50, keeping old single-argument callers
    working. Gradio Textbox values arrive as strings, so both are coerced
    to int, falling back to the defaults when blank.

    Returns:
        (keys, output) from run.textToAb.
    """
    lines = [x.strip() for x in text.split("\n") if x.strip() != '']
    topic_num = int(topic_num) if str(topic_num).strip() else 5
    max_length = int(max_length) if str(max_length).strip() else 50
    sentences = run.texClear(lines)
    keys, output = run.textToAb(sentences, lines, topic_num, max_length)
    return keys, output
9
+
10
def file_dump_to_json(file):
    """Placeholder for file-based input (PDF/Word/txt); not implemented yet."""
    # TODO: parse the uploaded file and feed it through the same pipeline
    # as text_dump_to_json.
    return None