Artteiv tosanoob committed on
Commit 3826b3b
1 Parent(s): 3444cc2

Refactoring gemini functions (#9)


- Refactoring gemini functions (b483e15951ebe8673817778370aa68a016fa09fa)


Co-authored-by: Trương Tấn Cường <[email protected]>

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +37 -35
  2. .gitignore +4 -6
  3. arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/data_level0.bin +3 -0
  4. arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/header.bin +3 -0
  5. arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/index_metadata.pickle +3 -0
  6. arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/length.bin +3 -0
  7. arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/link_lists.bin +3 -0
  8. arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/data_level0.bin +3 -0
  9. arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/header.bin +3 -0
  10. arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/index_metadata.pickle +3 -0
  11. arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/length.bin +3 -0
  12. arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/link_lists.bin +3 -0
  13. arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/data_level0.bin +3 -0
  14. arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/header.bin +3 -0
  15. arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/index_metadata.pickle +3 -0
  16. arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/length.bin +3 -0
  17. arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/link_lists.bin +3 -0
  18. arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/data_level0.bin +3 -0
  19. arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/header.bin +3 -0
  20. arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/index_metadata.pickle +3 -0
  21. arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/length.bin +3 -0
  22. arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/link_lists.bin +3 -0
  23. arxivdb/chroma.sqlite3 +3 -0
  24. arxivdb/chromadb.sqlite3 +0 -0
  25. chat/__init__.py +2 -2
  26. chat/__pycache__/__init__.cpython-311.pyc +0 -0
  27. chat/__pycache__/apps.cpython-311.pyc +0 -0
  28. chat/__pycache__/consumers.cpython-311.pyc +0 -0
  29. chat/__pycache__/model_manage.cpython-311.pyc +0 -0
  30. chat/__pycache__/model_manage2.cpython-311.pyc +0 -0
  31. chat/__pycache__/routing.cpython-311.pyc +0 -0
  32. chat/__pycache__/urls.cpython-311.pyc +0 -0
  33. chat/__pycache__/views.cpython-311.pyc +0 -0
  34. chat/arxiv_bot/__pycache__/arxiv_bot_utils.cpython-311.pyc +0 -0
  35. chat/arxiv_bot/__pycache__/arxiv_bot_utils2.cpython-311.pyc +0 -0
  36. chat/arxiv_bot/arxiv_bot_utils.py +248 -248
  37. chat/arxiv_bot/arxiv_bot_utils2.py +297 -0
  38. chat/arxiv_bot/prebuild.ipynb +354 -354
  39. chat/consumers.py +10 -6
  40. chat/migrations/__pycache__/0001_initial.cpython-311.pyc +0 -0
  41. chat/migrations/__pycache__/__init__.cpython-311.pyc +0 -0
  42. chat/model_manage.py +238 -238
  43. chat/model_manage2.py +174 -0
  44. chatbot_django/__pycache__/__init__.cpython-311.pyc +0 -0
  45. chatbot_django/__pycache__/asgi.cpython-311.pyc +0 -0
  46. chatbot_django/__pycache__/settings.cpython-311.pyc +0 -0
  47. chatbot_django/__pycache__/urls.cpython-311.pyc +0 -0
  48. concat.txt +0 -0
  49. db.sqlite3 +0 -0
  50. models/models--jinaai--jina-bert-implementation/blobs/64b6ce6fe4477c320b0ab303e2f26ae98beae1f7 +0 -0
.gitattributes CHANGED
@@ -1,35 +1,37 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ arxivdb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ models/models--jinaai--jina-embeddings-v2-base-en/blobs/6b70f1386f05b9703ea4edf7f1550a8925399f9580e4cc754cc099efc1e736d8 filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,7 +1,5 @@
- arxivdb/
- models/
- __pycache__/
- *.pyc
- apikey.txt
- db.sqlite3
+ models/
+ __pycache__/
+ *.pyc
+ apikey.txt
  hotfix.ipynb
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d96c82cf4183e567eddf45be92064c7d818268621da9821caa2367bb20cba18
+ size 32120000
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a494575edafaafb2b60f5a2ad563719976abf7ae3a35ca7c9b5aaae36842006c
+ size 100
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91e4880dca7113b4c3a3644e63aa5809f4a30474d1332f66d5f0ad082fe41833
+ size 357939
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:814d4b3244fb0f86f8d5beac519239863d973c20c8fec45624d0c0ae54baf9cf
+ size 40000
arxivdb/4b0c8007-4402-4129-b225-8bd2a39f0757/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d679f3012c3a4ae23e21dbfce89bb153cab85edef4c19f5340a4464e99f4c014
+ size 87396
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31ea31ff76723407f460b7534220ef974bfb3a563732c1a85e01fd9b2610dc13
+ size 6424000
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db5064bd751b93036fa600922f99c2534c183c3335c5267c8c5413a73f450320
+ size 100
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd938c16ea62b22a52094297d5d570442daba226ad67e941b0254655e843c67a
+ size 65937
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d9d14a589aeeaf2e86552f9c3f1bb4f556e49244f186540c71bac6c1680e834
+ size 8000
arxivdb/6a7f20ca-1ffd-40b4-9707-7d9628097d5a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3751bd54da338722a3b5370921bf446e34169a639a18beb7145e5d4e9e3778e3
+ size 18268
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86e41597eb04379b7582da7eeb5fb0aaca29eb32749069e69886358370fab575
+ size 3212000
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fdb00e89b6ee7733fd37556b1da3447d9895ad7431512096c0e073ed667a25d0
+ size 100
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd5b11142a96276baf9591e9524a8d9241eb013902301021dddea3a81b61d63a
+ size 33934
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e680a6fe8d1f2bf76260963cf27e0c7bd58c39e9c82262906a285eaf89b1c27d
+ size 4000
arxivdb/7e557f5a-be88-4080-aa85-e3bcd927fcf9/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:666164435753a1784160baf485cc1c80e665103e6bd19a1998430f93246f1c29
+ size 8624
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ecac6a0b1c9974d085507909895bec9040788bd20bf184eae140000cef97551d
+ size 38544000
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:269f137da42a494d996ad44046f5e349b59d2d31eca4b39aa82d7ec76f62cdf9
+ size 100
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08461957df6b58835618a34c77a9c96b6dc54f21e04c60c9d10dd36d5b864414
+ size 429953
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e648660e0a36f652356dd7e0210b243cba14b3b7c267c3c05fdc7614b1d2dd03
+ size 48000
arxivdb/951c5fab-677a-4406-b99b-b4006c3423a2/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13b650da98a6bd2ec371437494a2cb09a2fae5b67d6eead12af43b40fb548e7c
+ size 104644
arxivdb/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c48f817474996b45a3f4da1e127a2fde083db4bfeddb71893d598b8200fb056
+ size 123736064
arxivdb/chromadb.sqlite3 ADDED
File without changes
chat/__init__.py CHANGED
@@ -3,7 +3,7 @@ import chat.arxiv_bot.arxiv_bot_utils as utils
  import os
  from getpass import getpass
  import json
- from .model_manage import get_model
+ # from .model_manage import get_model

- model = get_model()
+ # model = get_model()

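With the eager import removed, loading the chat package no longer builds the Gemini model at import time. A minimal sketch of the lazy pattern this change points to, assuming the refactored factory lives in the new chat/model_manage2.py and is named get_model() (both the module entry point and the function name are assumptions, not taken from this diff):

    # Hypothetical sketch: construct the Gemini model on first use instead of
    # at "import chat" time, as the old chat/__init__.py did.
    from chat import model_manage2  # assumed to expose a get_model() factory

    _model = None

    def get_or_create_model():
        """Return a cached model instance, creating it lazily on first call."""
        global _model
        if _model is None:
            _model = model_manage2.get_model()  # assumed factory name
        return _model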
chat/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (358 Bytes)

chat/__pycache__/apps.cpython-311.pyc ADDED
Binary file (602 Bytes)

chat/__pycache__/consumers.cpython-311.pyc ADDED
Binary file (1.91 kB)

chat/__pycache__/model_manage.cpython-311.pyc ADDED
Binary file (164 Bytes)

chat/__pycache__/model_manage2.cpython-311.pyc ADDED
Binary file (10.5 kB)

chat/__pycache__/routing.cpython-311.pyc ADDED
Binary file (567 Bytes)

chat/__pycache__/urls.cpython-311.pyc ADDED
Binary file (456 Bytes)

chat/__pycache__/views.cpython-311.pyc ADDED
Binary file (601 Bytes)

chat/arxiv_bot/__pycache__/arxiv_bot_utils.cpython-311.pyc ADDED
Binary file (177 Bytes)

chat/arxiv_bot/__pycache__/arxiv_bot_utils2.cpython-311.pyc ADDED
Binary file (19.1 kB)
chat/arxiv_bot/arxiv_bot_utils.py CHANGED
@@ -1,276 +1,276 @@
1
- import chromadb
2
- from chromadb import Documents, EmbeddingFunction, Embeddings
3
- from transformers import AutoModel
4
- import json
5
- from numpy.linalg import norm
6
- import sqlite3
7
- import urllib
8
- from django.conf import settings
9
 
10
 
11
- # this module act as a singleton class
12
 
13
- class JinaAIEmbeddingFunction(EmbeddingFunction):
14
- def __init__(self, model):
15
- super().__init__()
16
- self.model = model
17
 
18
- def __call__(self, input: Documents) -> Embeddings:
19
- embeddings = self.model.encode(input)
20
- return embeddings.tolist()
21
 
22
- # instance of embedding_model
23
- embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',
24
- trust_remote_code=True,
25
- cache_dir='models')
26
 
27
- # instance of JinaAIEmbeddingFunction
28
- ef = JinaAIEmbeddingFunction(embedding_model)
29
 
30
- # list of topics
31
- topic_descriptions = json.load(open("topic_descriptions.txt"))
32
- topics = list(dict.keys(topic_descriptions))
33
- embeddings = [embedding_model.encode(topic_descriptions[key]) for key in topic_descriptions]
34
- cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
35
 
36
- def choose_topic(summary):
37
- embed = embedding_model.encode(summary)
38
- topic = ""
39
- max_sim = 0.
40
- for i,key in enumerate(topics):
41
- sim = cos_sim(embed,embeddings[i])
42
- if sim > max_sim:
43
- topic = key
44
- max_sim = sim
45
- return topic
46
 
47
- def authors_list_to_str(authors):
48
- """input a list of authors, return a string represent authors"""
49
- text = ""
50
- for author in authors:
51
- text+=author+", "
52
- return text[:-3]
53
 
54
- def authors_str_to_list(string):
55
- """input a string of authors, return a list of authors"""
56
- authors = []
57
- list_auth = string.split("and")
58
- for author in list_auth:
59
- if author != "et al.":
60
- authors.append(author.strip())
61
- return authors
62
 
63
- def chunk_texts(text, max_char=400):
64
- """
65
- Chunk a long text into several chunks, with each chunk about 300-400 characters long,
66
- but make sure no word is cut in half.
67
- Args:
68
- text: The long text to be chunked.
69
- max_char: The maximum number of characters per chunk (default: 400).
70
- Returns:
71
- A list of chunks.
72
- """
73
- chunks = []
74
- current_chunk = ""
75
- words = text.split()
76
- for word in words:
77
- if len(current_chunk) + len(word) + 1 >= max_char:
78
- chunks.append(current_chunk)
79
- current_chunk = " "
80
- else:
81
- current_chunk += " " + word
82
- chunks.append(current_chunk.strip())
83
- return chunks
84
 
85
- def trimming(txt):
86
- start = txt.find("{")
87
- end = txt.rfind("}")
88
- return txt[start:end+1].replace("\n"," ")
89
 
90
- # crawl data
91
 
92
- def extract_tag(txt,tagname):
93
- return txt[txt.find("<"+tagname+">")+len(tagname)+2:txt.find("</"+tagname+">")]
94
 
95
- def get_record(extract):
96
- id = extract_tag(extract,"id")
97
- updated = extract_tag(extract,"updated")
98
- published = extract_tag(extract,"published")
99
- title = extract_tag(extract,"title").replace("\n ","").strip()
100
- summary = extract_tag(extract,"summary").replace("\n","").strip()
101
- authors = []
102
- while extract.find("<author>")!=-1:
103
- author = extract_tag(extract,"name")
104
- extract = extract[extract.find("</author>")+9:]
105
- authors.append(author)
106
- pattern = '<link title="pdf" href="'
107
- link_start = extract.find('<link title="pdf" href="')
108
- link = extract[link_start+len(pattern):extract.find("rel=",link_start)-2]
109
- return [id, updated, published, title, authors, link, summary]
110
 
111
- def crawl_exact_paper(title,author,max_results=3):
112
- authors = authors_list_to_str(author)
113
- records = []
114
- url = 'http://export.arxiv.org/api/query?search_query=ti:{title}+AND+au:{author}&max_results={max_results}'.format(title=title,author=authors,max_results=max_results)
115
- url = url.replace(" ","%20")
116
- try:
117
- arxiv_page = urllib.request.urlopen(url,timeout=100).read()
118
- xml = str(arxiv_page,encoding="utf-8")
119
- while xml.find("<entry>") != -1:
120
- extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
121
- xml = xml[xml.find("</entry>")+8:]
122
- extract = get_record(extract)
123
- topic = choose_topic(extract[6])
124
- records.append([topic,*extract])
125
- return records
126
- except Exception as e:
127
- return "Error: "+str(e)
128
 
129
- def crawl_arxiv(keyword_list, max_results=100):
130
- baseurl = 'http://export.arxiv.org/api/query?search_query='
131
- records = []
132
- for i,keyword in enumerate(keyword_list):
133
- if i ==0:
134
- url = baseurl + 'all:' + keyword
135
- else:
136
- url = url + '+OR+' + 'all:' + keyword
137
- url = url+ '&max_results=' + str(max_results)
138
- url = url.replace(' ', '%20')
139
- try:
140
- arxiv_page = urllib.request.urlopen(url,timeout=100).read()
141
- xml = str(arxiv_page,encoding="utf-8")
142
- while xml.find("<entry>") != -1:
143
- extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
144
- xml = xml[xml.find("</entry>")+8:]
145
- extract = get_record(extract)
146
- topic = choose_topic(extract[6])
147
- records.append([topic,*extract])
148
- return records
149
- except Exception as e:
150
- return "Error: "+str(e)
151
 
152
- class ArxivSQL:
153
- def __init__(self, table="arxivsql", name="db.sqlite3"):
154
- self.con = sqlite3.connect(name)
155
- self.cur = self.con.cursor()
156
- self.table = table
157
 
158
- def query(self, title="", author=[]):
159
- if len(title)>0:
160
- query_title = 'title like "%{}%"'.format(title)
161
- else:
162
- query_title = "True"
163
- if len(author)>0:
164
- query_author = 'authors like '
165
- for auth in author:
166
- query_author += "'%{}%' or ".format(auth)
167
- query_author = query_author[:-4]
168
- else:
169
- query_author = "True"
170
- query = "select * from {} where {} and {}".format(self.table,query_title,query_author)
171
- result = self.cur.execute(query)
172
- return result.fetchall()
173
 
174
- def query_id(self, ids=[]):
175
- try:
176
- if len(ids) == 0:
177
- return None
178
- query = "select * from {} where id in (".format(self.table)
179
- for id in ids:
180
- query+="'"+id+"',"
181
- query = query[:-1] + ")"
182
- result = self.cur.execute(query)
183
- return result.fetchall()
184
- except Exception as e:
185
- print(e)
186
- print("Error query: ",query)
187
 
188
- def add(self, crawl_records):
189
- """
190
- Add crawl_records (list) obtained from arxiv_crawlers
191
- A record is a list of 8 columns:
192
- [topic, id, updated, published, title, author, link, summary]
193
- Return the final length of the database table
194
- """
195
- results = ""
196
- for record in crawl_records:
197
- try:
198
- query = """insert into arxivsql values("{}","{}","{}","{}","{}","{}","{}")""".format(
199
- record[1][21:],
200
- record[0],
201
- record[4].replace('"',"'"),
202
- authors_list_to_str(record[5]),
203
- record[2][:10],
204
- record[3][:10],
205
- record[6]
206
- )
207
- self.cur.execute(query)
208
- self.con.commit()
209
- except Exception as e:
210
- result+=str(e)
211
- result+="\n" + query + "\n"
212
- finally:
213
- return results
214
 
215
- # instance of ArxivSQL
216
- sqldb = ArxivSQL()
217
 
218
- class ArxivChroma:
219
- """
220
- Create an interface to arxivdb, which only support query and addition.
221
- This interface do not support edition and deletion procedures.
222
- """
223
- def __init__(self, table="arxiv_records", name="arxivdb/"):
224
- self.client = chromadb.PersistentClient(name)
225
- self.model = embedding_model
226
- self.collection = self.client.get_or_create_collection(table,
227
- embedding_function=JinaAIEmbeddingFunction(
228
- model = self.model
229
- ))
230
 
231
- def query_relevant(self, keywords, query_texts, n_results=3):
232
- """
233
- Perform a query using a list of keywords (str),
234
- or using a relavant string
235
- """
236
- contains = []
237
- for keyword in keywords:
238
- contains.append({"$contains":keyword.lower()})
239
- return self.collection.query(
240
- query_texts=query_texts,
241
- where_document={
242
- "$or":contains
243
- },
244
- n_results=n_results,
245
- )
246
 
247
- def query_exact(self, id):
248
- ids = ["{}_{}".format(id,j) for j in range(0,10)]
249
- return self.collection.get(ids=ids)
250
 
251
- def add(self, crawl_records):
252
- """
253
- Add crawl_records (list) obtained from arxiv_crawlers
254
- A record is a list of 8 columns:
255
- [topic, id, updated, published, title, author, link, summary]
256
- Return the final length of the database table
257
- """
258
- for record in crawl_records:
259
- embed_text = """
260
- Topic: {},
261
- Title: {},
262
- Summary: {}
263
- """.format(record[0],record[4],record[7])
264
- chunks = chunk_texts(embed_text)
265
- ids = [record[1][21:]+"_"+str(j) for j in range(len(chunks))]
266
- paper_ids = [{"paper_id":record[1][21:]} for _ in range(len(chunks))]
267
- self.collection.add(
268
- documents = chunks,
269
- metadatas=paper_ids,
270
- ids = ids
271
- )
272
- return self.collection.count()
273
 
274
- # instance of ArxivChroma
275
- db = ArxivChroma()
276
 
 
1
+ # import chromadb
2
+ # from chromadb import Documents, EmbeddingFunction, Embeddings
3
+ # from transformers import AutoModel
4
+ # import json
5
+ # from numpy.linalg import norm
6
+ # import sqlite3
7
+ # import urllib
8
+ # from django.conf import settings
9
 
10
 
11
+ # # this module act as a singleton class
12
 
13
+ # class JinaAIEmbeddingFunction(EmbeddingFunction):
14
+ # def __init__(self, model):
15
+ # super().__init__()
16
+ # self.model = model
17
 
18
+ # def __call__(self, input: Documents) -> Embeddings:
19
+ # embeddings = self.model.encode(input)
20
+ # return embeddings.tolist()
21
 
22
+ # # instance of embedding_model
23
+ # embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',
24
+ # trust_remote_code=True,
25
+ # cache_dir='models')
26
 
27
+ # # instance of JinaAIEmbeddingFunction
28
+ # ef = JinaAIEmbeddingFunction(embedding_model)
29
 
30
+ # # list of topics
31
+ # topic_descriptions = json.load(open("topic_descriptions.txt"))
32
+ # topics = list(dict.keys(topic_descriptions))
33
+ # embeddings = [embedding_model.encode(topic_descriptions[key]) for key in topic_descriptions]
34
+ # cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
35
 
36
+ # def choose_topic(summary):
37
+ # embed = embedding_model.encode(summary)
38
+ # topic = ""
39
+ # max_sim = 0.
40
+ # for i,key in enumerate(topics):
41
+ # sim = cos_sim(embed,embeddings[i])
42
+ # if sim > max_sim:
43
+ # topic = key
44
+ # max_sim = sim
45
+ # return topic
46
 
47
+ # def authors_list_to_str(authors):
48
+ # """input a list of authors, return a string represent authors"""
49
+ # text = ""
50
+ # for author in authors:
51
+ # text+=author+", "
52
+ # return text[:-3]
53
 
54
+ # def authors_str_to_list(string):
55
+ # """input a string of authors, return a list of authors"""
56
+ # authors = []
57
+ # list_auth = string.split("and")
58
+ # for author in list_auth:
59
+ # if author != "et al.":
60
+ # authors.append(author.strip())
61
+ # return authors
62
 
63
+ # def chunk_texts(text, max_char=400):
64
+ # """
65
+ # Chunk a long text into several chunks, with each chunk about 300-400 characters long,
66
+ # but make sure no word is cut in half.
67
+ # Args:
68
+ # text: The long text to be chunked.
69
+ # max_char: The maximum number of characters per chunk (default: 400).
70
+ # Returns:
71
+ # A list of chunks.
72
+ # """
73
+ # chunks = []
74
+ # current_chunk = ""
75
+ # words = text.split()
76
+ # for word in words:
77
+ # if len(current_chunk) + len(word) + 1 >= max_char:
78
+ # chunks.append(current_chunk)
79
+ # current_chunk = " "
80
+ # else:
81
+ # current_chunk += " " + word
82
+ # chunks.append(current_chunk.strip())
83
+ # return chunks
84
 
85
+ # def trimming(txt):
86
+ # start = txt.find("{")
87
+ # end = txt.rfind("}")
88
+ # return txt[start:end+1].replace("\n"," ")
89
 
90
+ # # crawl data
91
 
92
+ # def extract_tag(txt,tagname):
93
+ # return txt[txt.find("<"+tagname+">")+len(tagname)+2:txt.find("</"+tagname+">")]
94
 
95
+ # def get_record(extract):
96
+ # id = extract_tag(extract,"id")
97
+ # updated = extract_tag(extract,"updated")
98
+ # published = extract_tag(extract,"published")
99
+ # title = extract_tag(extract,"title").replace("\n ","").strip()
100
+ # summary = extract_tag(extract,"summary").replace("\n","").strip()
101
+ # authors = []
102
+ # while extract.find("<author>")!=-1:
103
+ # author = extract_tag(extract,"name")
104
+ # extract = extract[extract.find("</author>")+9:]
105
+ # authors.append(author)
106
+ # pattern = '<link title="pdf" href="'
107
+ # link_start = extract.find('<link title="pdf" href="')
108
+ # link = extract[link_start+len(pattern):extract.find("rel=",link_start)-2]
109
+ # return [id, updated, published, title, authors, link, summary]
110
 
111
+ # def crawl_exact_paper(title,author,max_results=3):
112
+ # authors = authors_list_to_str(author)
113
+ # records = []
114
+ # url = 'http://export.arxiv.org/api/query?search_query=ti:{title}+AND+au:{author}&max_results={max_results}'.format(title=title,author=authors,max_results=max_results)
115
+ # url = url.replace(" ","%20")
116
+ # try:
117
+ # arxiv_page = urllib.request.urlopen(url,timeout=100).read()
118
+ # xml = str(arxiv_page,encoding="utf-8")
119
+ # while xml.find("<entry>") != -1:
120
+ # extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
121
+ # xml = xml[xml.find("</entry>")+8:]
122
+ # extract = get_record(extract)
123
+ # topic = choose_topic(extract[6])
124
+ # records.append([topic,*extract])
125
+ # return records
126
+ # except Exception as e:
127
+ # return "Error: "+str(e)
128
 
129
+ # def crawl_arxiv(keyword_list, max_results=100):
130
+ # baseurl = 'http://export.arxiv.org/api/query?search_query='
131
+ # records = []
132
+ # for i,keyword in enumerate(keyword_list):
133
+ # if i ==0:
134
+ # url = baseurl + 'all:' + keyword
135
+ # else:
136
+ # url = url + '+OR+' + 'all:' + keyword
137
+ # url = url+ '&max_results=' + str(max_results)
138
+ # url = url.replace(' ', '%20')
139
+ # try:
140
+ # arxiv_page = urllib.request.urlopen(url,timeout=100).read()
141
+ # xml = str(arxiv_page,encoding="utf-8")
142
+ # while xml.find("<entry>") != -1:
143
+ # extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
144
+ # xml = xml[xml.find("</entry>")+8:]
145
+ # extract = get_record(extract)
146
+ # topic = choose_topic(extract[6])
147
+ # records.append([topic,*extract])
148
+ # return records
149
+ # except Exception as e:
150
+ # return "Error: "+str(e)
151
 
152
+ # class ArxivSQL:
153
+ # def __init__(self, table="arxivsql", name="db.sqlite3"):
154
+ # self.con = sqlite3.connect(name)
155
+ # self.cur = self.con.cursor()
156
+ # self.table = table
157
 
158
+ # def query(self, title="", author=[]):
159
+ # if len(title)>0:
160
+ # query_title = 'title like "%{}%"'.format(title)
161
+ # else:
162
+ # query_title = "True"
163
+ # if len(author)>0:
164
+ # query_author = 'authors like '
165
+ # for auth in author:
166
+ # query_author += "'%{}%' or ".format(auth)
167
+ # query_author = query_author[:-4]
168
+ # else:
169
+ # query_author = "True"
170
+ # query = "select * from {} where {} and {}".format(self.table,query_title,query_author)
171
+ # result = self.cur.execute(query)
172
+ # return result.fetchall()
173
 
174
+ # def query_id(self, ids=[]):
175
+ # try:
176
+ # if len(ids) == 0:
177
+ # return None
178
+ # query = "select * from {} where id in (".format(self.table)
179
+ # for id in ids:
180
+ # query+="'"+id+"',"
181
+ # query = query[:-1] + ")"
182
+ # result = self.cur.execute(query)
183
+ # return result.fetchall()
184
+ # except Exception as e:
185
+ # print(e)
186
+ # print("Error query: ",query)
187
 
188
+ # def add(self, crawl_records):
189
+ # """
190
+ # Add crawl_records (list) obtained from arxiv_crawlers
191
+ # A record is a list of 8 columns:
192
+ # [topic, id, updated, published, title, author, link, summary]
193
+ # Return the final length of the database table
194
+ # """
195
+ # results = ""
196
+ # for record in crawl_records:
197
+ # try:
198
+ # query = """insert into arxivsql values("{}","{}","{}","{}","{}","{}","{}")""".format(
199
+ # record[1][21:],
200
+ # record[0],
201
+ # record[4].replace('"',"'"),
202
+ # authors_list_to_str(record[5]),
203
+ # record[2][:10],
204
+ # record[3][:10],
205
+ # record[6]
206
+ # )
207
+ # self.cur.execute(query)
208
+ # self.con.commit()
209
+ # except Exception as e:
210
+ # result+=str(e)
211
+ # result+="\n" + query + "\n"
212
+ # finally:
213
+ # return results
214
 
215
+ # # instance of ArxivSQL
216
+ # sqldb = ArxivSQL()
217
 
218
+ # class ArxivChroma:
219
+ # """
220
+ # Create an interface to arxivdb, which only support query and addition.
221
+ # This interface do not support edition and deletion procedures.
222
+ # """
223
+ # def __init__(self, table="arxiv_records", name="arxivdb/"):
224
+ # self.client = chromadb.PersistentClient(name)
225
+ # self.model = embedding_model
226
+ # self.collection = self.client.get_or_create_collection(table,
227
+ # embedding_function=JinaAIEmbeddingFunction(
228
+ # model = self.model
229
+ # ))
230
 
231
+ # def query_relevant(self, keywords, query_texts, n_results=3):
232
+ # """
233
+ # Perform a query using a list of keywords (str),
234
+ # or using a relavant string
235
+ # """
236
+ # contains = []
237
+ # for keyword in keywords:
238
+ # contains.append({"$contains":keyword.lower()})
239
+ # return self.collection.query(
240
+ # query_texts=query_texts,
241
+ # where_document={
242
+ # "$or":contains
243
+ # },
244
+ # n_results=n_results,
245
+ # )
246
 
247
+ # def query_exact(self, id):
248
+ # ids = ["{}_{}".format(id,j) for j in range(0,10)]
249
+ # return self.collection.get(ids=ids)
250
 
251
+ # def add(self, crawl_records):
252
+ # """
253
+ # Add crawl_records (list) obtained from arxiv_crawlers
254
+ # A record is a list of 8 columns:
255
+ # [topic, id, updated, published, title, author, link, summary]
256
+ # Return the final length of the database table
257
+ # """
258
+ # for record in crawl_records:
259
+ # embed_text = """
260
+ # Topic: {},
261
+ # Title: {},
262
+ # Summary: {}
263
+ # """.format(record[0],record[4],record[7])
264
+ # chunks = chunk_texts(embed_text)
265
+ # ids = [record[1][21:]+"_"+str(j) for j in range(len(chunks))]
266
+ # paper_ids = [{"paper_id":record[1][21:]} for _ in range(len(chunks))]
267
+ # self.collection.add(
268
+ # documents = chunks,
269
+ # metadatas=paper_ids,
270
+ # ids = ids
271
+ # )
272
+ # return self.collection.count()
273
 
274
+ # # instance of ArxivChroma
275
+ # db = ArxivChroma()
276
 
chat/arxiv_bot/arxiv_bot_utils2.py ADDED
@@ -0,0 +1,297 @@
1
+ import chromadb
2
+ from chromadb import Documents, EmbeddingFunction, Embeddings
3
+ from transformers import AutoModel
4
+ import json
5
+ from numpy.linalg import norm
6
+ import sqlite3
7
+ import urllib
8
+ from django.conf import settings
9
+ import Levenshtein
10
+
11
+ # this module act as a singleton class
12
+
13
+ class JinaAIEmbeddingFunction(EmbeddingFunction):
14
+ def __init__(self, model):
15
+ super().__init__()
16
+ self.model = model
17
+
18
+ def __call__(self, input: Documents) -> Embeddings:
19
+ embeddings = self.model.encode(input)
20
+ return embeddings.tolist()
21
+
22
+ # instance of embedding_model
23
+ embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',
24
+ trust_remote_code=True,
25
+ cache_dir='models')
26
+
27
+ # instance of JinaAIEmbeddingFunction
28
+ ef = JinaAIEmbeddingFunction(embedding_model)
29
+
30
+ # list of topics
31
+ topic_descriptions = json.load(open("topic_descriptions.txt"))
32
+ topics = list(dict.keys(topic_descriptions))
33
+ embeddings = [embedding_model.encode(topic_descriptions[key]) for key in topic_descriptions]
34
+ cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
35
+
36
+ def lev_sim(a,b): return Levenshtein.distance(a,b)
37
+
38
+ def choose_topic(summary):
39
+ embed = embedding_model.encode(summary)
40
+ topic = ""
41
+ max_sim = 0.
42
+ for i,key in enumerate(topics):
43
+ sim = cos_sim(embed,embeddings[i])
44
+ if sim > max_sim:
45
+ topic = key
46
+ max_sim = sim
47
+ return topic
48
+
49
+ def authors_list_to_str(authors):
50
+ """input a list of authors, return a string represent authors"""
51
+ text = ""
52
+ for author in authors:
53
+ text+=author+", "
54
+ return text[:-3]
55
+
56
+ def authors_str_to_list(string):
57
+ """input a string of authors, return a list of authors"""
58
+ authors = []
59
+ list_auth = string.split("and")
60
+ for author in list_auth:
61
+ if author != "et al.":
62
+ authors.append(author.strip())
63
+ return authors
64
+
65
+ def chunk_texts(text, max_char=400):
66
+ """
67
+ Chunk a long text into several chunks, with each chunk about 300-400 characters long,
68
+ but make sure no word is cut in half.
69
+ Args:
70
+ text: The long text to be chunked.
71
+ max_char: The maximum number of characters per chunk (default: 400).
72
+ Returns:
73
+ A list of chunks.
74
+ """
75
+ chunks = []
76
+ current_chunk = ""
77
+ words = text.split()
78
+ for word in words:
79
+ if len(current_chunk) + len(word) + 1 >= max_char:
80
+ chunks.append(current_chunk)
81
+ current_chunk = " "
82
+ else:
83
+ current_chunk += " " + word
84
+ chunks.append(current_chunk.strip())
85
+ return chunks
86
+
87
+ def trimming(txt):
88
+ start = txt.find("{")
89
+ end = txt.rfind("}")
90
+ return txt[start:end+1].replace("\n"," ")
91
+
92
+ # crawl data
93
+
94
+ def extract_tag(txt,tagname):
95
+ return txt[txt.find("<"+tagname+">")+len(tagname)+2:txt.find("</"+tagname+">")]
96
+
97
+ def get_record(extract):
98
+ id = extract_tag(extract,"id")
99
+ updated = extract_tag(extract,"updated")
100
+ published = extract_tag(extract,"published")
101
+ title = extract_tag(extract,"title").replace("\n ","").strip()
102
+ summary = extract_tag(extract,"summary").replace("\n","").strip()
103
+ authors = []
104
+ while extract.find("<author>")!=-1:
105
+ author = extract_tag(extract,"name")
106
+ extract = extract[extract.find("</author>")+9:]
107
+ authors.append(author)
108
+ pattern = '<link title="pdf" href="'
109
+ link_start = extract.find('<link title="pdf" href="')
110
+ link = extract[link_start+len(pattern):extract.find("rel=",link_start)-2]
111
+ return [id, updated, published, title, authors, link, summary]
112
+
113
+ def crawl_exact_paper(title,author,max_results=3):
114
+ authors = authors_list_to_str(author)
115
+ records = []
116
+ url = 'http://export.arxiv.org/api/query?search_query=ti:{title}+AND+au:{author}&max_results={max_results}'.format(title=title,author=authors,max_results=max_results)
117
+ url = url.replace(" ","%20")
118
+ try:
119
+ arxiv_page = urllib.request.urlopen(url,timeout=100).read()
120
+ xml = str(arxiv_page,encoding="utf-8")
121
+ while xml.find("<entry>") != -1:
122
+ extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
123
+ xml = xml[xml.find("</entry>")+8:]
124
+ extract = get_record(extract)
125
+ topic = choose_topic(extract[6])
126
+ records.append([topic,*extract])
127
+ return records
128
+ except Exception as e:
129
+ return "Error: "+str(e)
130
+
131
+ def crawl_arxiv(keyword_list, max_results=100):
132
+ baseurl = 'http://export.arxiv.org/api/query?search_query='
133
+ records = []
134
+ for i,keyword in enumerate(keyword_list):
135
+ if i ==0:
136
+ url = baseurl + 'all:' + keyword
137
+ else:
138
+ url = url + '+OR+' + 'all:' + keyword
139
+ url = url+ '&max_results=' + str(max_results)
140
+ url = url.replace(' ', '%20')
141
+ try:
142
+ arxiv_page = urllib.request.urlopen(url,timeout=100).read()
143
+ xml = str(arxiv_page,encoding="utf-8")
144
+ while xml.find("<entry>") != -1:
145
+ extract = xml[xml.find("<entry>")+7:xml.find("</entry>")]
146
+ xml = xml[xml.find("</entry>")+8:]
147
+ extract = get_record(extract)
148
+ topic = choose_topic(extract[6])
149
+ records.append([topic,*extract])
150
+ return records
151
+ except Exception as e:
152
+ return "Error: "+str(e)
153
+
154
+ # This class act as a module
155
+ class ArxivChroma:
156
+ """
157
+ Create an interface to arxivdb, which only support query and addition.
158
+ This interface do not support edition and deletion procedures.
159
+ """
160
+ client = None
161
+ model = None
162
+ collection = None
163
+
164
+ @staticmethod
165
+ def connect(table="arxiv_records", name="arxivdb/"):
166
+ ArxivChroma.client = chromadb.PersistentClient(name)
167
+ ArxivChroma.model = embedding_model
168
+ ArxivChroma.collection = ArxivChroma.client.get_or_create_collection(table,
169
+ embedding_function=JinaAIEmbeddingFunction(
170
+ model = ArxivChroma.model
171
+ ))
172
+
173
+ @staticmethod
174
+ def query_relevant(keywords, query_texts, n_results=3):
175
+ """
176
+ Perform a query using a list of keywords (str),
177
+ or using a relavant string
178
+ """
179
+ contains = []
180
+ for keyword in keywords:
181
+ contains.append({"$contains":keyword.lower()})
182
+ return ArxivChroma.collection.query(
183
+ query_texts=query_texts,
184
+ where_document={
185
+ "$or":contains
186
+ },
187
+ n_results=n_results,
188
+ )
189
+
190
+ @staticmethod
191
+ def query_exact(id):
192
+ ids = ["{}_{}".format(id,j) for j in range(0,10)]
193
+ return ArxivChroma.collection.get(ids=ids)
194
+
195
+ @staticmethod
196
+ def add(crawl_records):
197
+ """
198
+ Add crawl_records (list) obtained from arxiv_crawlers
199
+ A record is a list of 8 columns:
200
+ [topic, id, updated, published, title, author, link, summary]
201
+ Return the final length of the database table
202
+ """
203
+ for record in crawl_records:
204
+ embed_text = """
205
+ Topic: {},
206
+ Title: {},
207
+ Summary: {}
208
+ """.format(record[0],record[4],record[7])
209
+ chunks = chunk_texts(embed_text)
210
+ ids = [record[1][21:]+"_"+str(j) for j in range(len(chunks))]
211
+ paper_ids = [{"paper_id":record[1][21:]} for _ in range(len(chunks))]
212
+ ArxivChroma.collection.add(
213
+ documents = chunks,
214
+ metadatas=paper_ids,
215
+ ids = ids
216
+ )
217
+ return ArxivChroma.collection.count()
218
+
219
+ @staticmethod
220
+ def close_connection():
221
+ pass
222
+
223
+ # This class act as a module
224
+ class ArxivSQL:
225
+ table = "arxivsql"
226
+ con = None
227
+ cur = None
228
+
229
+ @staticmethod
230
+ def connect(name="db.sqlite3"):
231
+ ArxivSQL.con = sqlite3.connect(name, check_same_thread=False)
232
+ ArxivSQL.cur = ArxivSQL.con.cursor()
233
+
234
+ @staticmethod
235
+ def query(title="", author=[], threshold = 15):
236
+ if len(author)>0:
237
+ query_author= " OR ".join([f"author LIKE '%{a}%'" for a in author])
238
+ else:
239
+ query_author= "True"
240
+ # Execute the query
241
+ query = f"select * from {ArxivSQL.table} where {query_author}"
242
+ results = ArxivSQL.cursor.execute(query).fetchall()
243
+ if len(title) == 0:
244
+ return results
245
+ else:
246
+ sim_score = {}
247
+ for row in results:
248
+ row_title = row[2]
249
+ row_id = row[0]
250
+ score = lev_sim(title, row_title)
251
+ if score < threshold:
252
+ sim_score[row_id] = score
253
+ sorted_results = sorted(sim_score.items(), key=lambda x: x[1])
254
+ return ArxivSQL.query_id(sorted_results)
255
+
256
+ @staticmethod
257
+ def query_id(ids=[]):
258
+ try:
259
+ if len(ids) == 0:
260
+ return None
261
+ query = "select * from {} where id in (".format(ArxivSQL.table)
262
+ for id in ids:
263
+ query+="'"+id+"',"
264
+ query = query[:-1] + ")"
265
+ result = ArxivSQL.cur.execute(query)
266
+ return result.fetchall()
267
+ except Exception as e:
268
+ print(e)
269
+ print("Error query: ",query)
270
+
271
+ @staticmethod
272
+ def add(crawl_records):
273
+ """
274
+ Add crawl_records (list) obtained from arxiv_crawlers
275
+ A record is a list of 8 columns:
276
+ [topic, id, updated, published, title, author, link, summary]
277
+ Return the final length of the database table
278
+ """
279
+ results = ""
280
+ for record in crawl_records:
281
+ try:
282
+ query = """insert into arxivsql values("{}","{}","{}","{}","{}","{}","{}")""".format(
283
+ record[1][21:],
284
+ record[0],
285
+ record[4].replace('"',"'"),
286
+ authors_list_to_str(record[5]),
287
+ record[2][:10],
288
+ record[3][:10],
289
+ record[6]
290
+ )
291
+ ArxivSQL.cur.execute(query)
292
+ ArxivSQL.con.commit()
293
+ except Exception as e:
294
+ results+=str(e)
295
+ results+="\n" + query + "\n"
296
+ finally:
297
+ return results
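The refactored module keeps its connections as class-level state and exposes static methods, so callers connect once and then use ArxivChroma and ArxivSQL directly rather than instantiating them. A minimal usage sketch based on the code above (the keyword list and query text are illustrative only):

    import chat.arxiv_bot.arxiv_bot_utils2 as utils

    # open the shared connections once, e.g. at application startup
    utils.ArxivChroma.connect()                # defaults: table="arxiv_records", name="arxivdb/"
    utils.ArxivSQL.connect(name="db.sqlite3")  # check_same_thread=False lets other threads reuse it

    # vector search over the stored chunks, then join back to the SQL metadata
    results = utils.ArxivChroma.query_relevant(
        keywords=["reinforcement learning", "robotics"],
        query_texts="recent work on reinforcement learning for robot control",
    )
    paper_ids = [m["paper_id"] for m in results["metadatas"][0]]
    print(utils.ArxivSQL.query_id(paper_ids))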
chat/arxiv_bot/prebuild.ipynb CHANGED
@@ -1,354 +1,354 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [
8
- {
9
- "name": "stderr",
10
- "output_type": "stream",
11
- "text": [
12
- "d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
- " from .autonotebook import tqdm as notebook_tqdm\n",
14
- "d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
15
- " warnings.warn(\n",
16
- "d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
17
- " warnings.warn(\n"
18
- ]
19
- }
20
- ],
21
- "source": [
22
- "import google.generativeai as genai\n",
23
- "import arxiv_bot_utils as utils\n",
24
- "import os\n",
25
- "from getpass import getpass\n",
26
- "import json\n",
27
- "#chỉ là import một cách bình thường\n",
28
- "#nội dung là "
29
- ]
30
- },
31
- {
32
- "cell_type": "code",
33
- "execution_count": 2,
34
- "metadata": {},
35
- "outputs": [
36
- {
37
- "name": "stdout",
38
- "output_type": "stream",
39
- "text": [
40
- "models/gemini-1.0-pro\n",
41
- "models/gemini-1.0-pro-001\n",
42
- "models/gemini-1.0-pro-latest\n",
43
- "models/gemini-1.0-pro-vision-latest\n",
44
- "models/gemini-1.5-pro-latest\n",
45
- "models/gemini-pro\n",
46
- "models/gemini-pro-vision\n"
47
- ]
48
- }
49
- ],
50
- "source": [
51
- "os.environ['GEMINI_API_KEY'] = getpass(\"Input your API key: \")\n",
52
- "# gán biến môi trường luôn\n",
53
- "gemini_api_key = os.getenv(\"GEMINI_API_KEY\") # string trong môi trường\n",
54
- "if not gemini_api_key:\n",
55
- " raise ValueError(\n",
56
- " \"Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable\"\n",
57
- " )\n",
58
- "genai.configure(api_key=gemini_api_key)\n",
59
- "for m in genai.list_models():\n",
60
- " if 'generateContent' in m.supported_generation_methods:\n",
61
- " print(m.name)\n",
62
- " #models nằm trên máy chủ\n"
63
- ]
64
- },
65
- {
66
- "cell_type": "code",
67
- "execution_count": 3,
68
- "metadata": {},
69
- "outputs": [],
70
- "source": [
71
- "config = genai.GenerationConfig(max_output_tokens=2048,\n",
72
- " temperature=0.7)\n",
73
- "safety_settings = [\n",
74
- " {\n",
75
- " \"category\": \"HARM_CATEGORY_DANGEROUS\",\n",
76
- " \"threshold\": \"BLOCK_NONE\",\n",
77
- " },\n",
78
- " {\n",
79
- " \"category\": \"HARM_CATEGORY_HARASSMENT\",\n",
80
- " \"threshold\": \"BLOCK_NONE\",\n",
81
- " },\n",
82
- " {\n",
83
- " \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n",
84
- " \"threshold\": \"BLOCK_NONE\",\n",
85
- " },\n",
86
- " {\n",
87
- " \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n",
88
- " \"threshold\": \"BLOCK_NONE\",\n",
89
- " },\n",
90
- " {\n",
91
- " \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n",
92
- " \"threshold\": \"BLOCK_NONE\",\n",
93
- " },\n",
94
- "]\n",
95
- "model = genai.GenerativeModel(\"gemini-pro\",\n",
96
- " generation_config=config,\n",
97
- " safety_settings=safety_settings)"
98
- ]
99
- },
100
- {
101
- "cell_type": "code",
102
- "execution_count": 4,
103
- "metadata": {},
104
- "outputs": [],
105
- "source": [
106
- "def extract_keyword_prompt(query):\n",
107
- " \"\"\"A prompt that return a JSON block as arguments for querying database\"\"\"\n",
108
- "\n",
109
- " prompt = (\n",
110
- " \"\"\"[INST] SYSTEM: You are an assistant that choose only one action below based on guest question.\n",
111
- " 1. If the guest question is asking for a single specific document or article with explicit title, you need to respond the information in JSON format with 2 keys \"title\", \"author\" if found any above. The authors are separated with the word 'and'. \n",
112
- " 2. If the guest question is asking for relevant informations about a topic, you need to respond the information in JSON format with 2 keys \"keywords\", \"description\", include a list of keywords represent the main academic topic, \\\n",
113
- " and a description about the main topic. You may paraphrase the keywords to add more. \\\n",
114
- " 3. If the guest is not asking for any informations or documents, you need to respond with a polite answer in JSON format with 1 key \"answer\".\n",
115
- " QUESTION: '{query}'\n",
116
- " [/INST]\n",
117
- " ANSWER: \n",
118
- " \"\"\"\n",
119
- " ).format(query=query)\n",
120
- "\n",
121
- " return prompt\n",
122
- "\n",
123
- "def make_answer_prompt(input, contexts):\n",
124
- " \"\"\"A prompt that return the final answer, based on the queried context\"\"\"\n",
125
- "\n",
126
- " prompt = (\n",
127
- " \"\"\"[INST] You are a library assistant that help to search articles and documents based on user's question.\n",
128
- " From guest's question, you have found some records and documents that may help. Now you need to answer the guest with the information found.\n",
129
- " If no information found in the database, you may generate some other recommendation related to user's question using your own knowledge. Each article or paper must have a link to the pdf download page.\n",
130
- " You should answer in a conversational form politely.\n",
131
- " QUESTION: '{input}'\n",
132
- " INFORMATION: '{contexts}'\n",
133
- " [/INST]\n",
134
- " ANSWER:\n",
135
- " \"\"\"\n",
136
- " ).format(input=input, contexts=contexts)\n",
137
- "\n",
138
- " return prompt"
139
- ]
140
- },
141
- {
142
- "cell_type": "code",
143
- "execution_count": 5,
144
- "metadata": {},
145
- "outputs": [],
146
- "source": [
147
- "def response(args):\n",
148
- " \"\"\"Create response context, based on input arguments\"\"\"\n",
149
- " keys = list(dict.keys(args))\n",
150
- " if \"answer\" in keys:\n",
151
- " return args['answer'], None # trả lời trực tiếp\n",
152
- " \n",
153
- " if \"keywords\" in keys:\n",
154
- " # perform query\n",
155
- " query_texts = args[\"description\"]\n",
156
- " keywords = args[\"keywords\"]\n",
157
- " results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
158
- " # print(results)\n",
159
- " ids = results['metadatas'][0]\n",
160
- " if len(ids) == 0:\n",
161
- " # go crawl some\n",
162
- " new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)\n",
163
- " print(\"Got new records: \",len(new_records))\n",
164
- " if type(new_records) == str:\n",
165
- " return \"Error occured, information not found\", new_records\n",
166
- " utils.db.add(new_records)\n",
167
- " utils.sqldb.add(new_records)\n",
168
- " results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
169
- " ids = results['metadatas'][0]\n",
170
- " print(\"Re-queried on chromadb, results: \",ids)\n",
171
- " paper_id = [id['paper_id'] for id in ids]\n",
172
- " paper_info = utils.sqldb.query_id(paper_id)\n",
173
- " print(paper_info)\n",
174
- " records = [] # get title (2), author (3), link (6)\n",
175
- " result_string = \"\"\n",
176
- " if paper_info:\n",
177
- " for i in range(len(paper_info)):\n",
178
- " result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
179
- " records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
180
- " return result_string, records\n",
181
- " else:\n",
182
- " return \"Information not found\", \"Information not found\"\n",
183
- " # invoke llm and return result\n",
184
- "\n",
185
- " if \"title\" in keys:\n",
186
- " title = args['title']\n",
187
- " authors = utils.authors_str_to_list(args['author'])\n",
188
- " paper_info = utils.sqldb.query(title = title,author = authors)\n",
189
- " # if query not found then go crawl brh\n",
190
- " # print(paper_info)\n",
191
- "\n",
192
- " if len(paper_info) == 0:\n",
193
- " new_records = utils.crawl_exact_paper(title=title,author=authors)\n",
194
- " print(\"Got new records: \",len(new_records))\n",
195
- " if type(new_records) == str:\n",
196
- " # print(new_records)\n",
197
- " return \"Error occured, information not found\", \"Information not found\"\n",
198
- " utils.db.add(new_records)\n",
199
- " utils.sqldb.add(new_records)\n",
200
- " paper_info = utils.sqldb.query(title = title,author = authors)\n",
201
- " print(\"Re-queried on chromadb, results: \",paper_info)\n",
202
- " # -------------------------------------\n",
203
- " records = [] # get title (2), author (3), link (6)\n",
204
- " result_string = \"\"\n",
205
- " for i in range(len(paper_info)):\n",
206
- " result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
207
- " records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
208
- " # process results:\n",
209
- " if len(result_string) == 0:\n",
210
- " return \"Information not found\", \"Information not found\"\n",
211
- " return result_string, records\n",
212
- " # invoke llm and return result"
213
- ]
214
- },
215
- {
216
- "cell_type": "code",
217
- "execution_count": 6,
218
- "metadata": {},
219
- "outputs": [],
220
- "source": [
221
- "def full_chain_single_question(input_prompt):\n",
222
- " try:\n",
223
- " first_prompt = extract_keyword_prompt(input_prompt)\n",
224
- " temp_answer = model.generate_content(first_prompt).text\n",
225
- "\n",
226
- " args = json.loads(utils.trimming(temp_answer))\n",
227
- " contexts, results = response(args)\n",
228
- " if not results:\n",
229
- " print(contexts)\n",
230
- " else:\n",
231
- " output_prompt = make_answer_prompt(input_prompt,contexts)\n",
232
- " answer = model.generate_content(output_prompt).text\n",
233
- " return temp_answer, answer\n",
234
- " except Exception as e:\n",
235
- " print(e)\n",
236
- " return temp_answer, \"Error occured: \" + str(e)"
237
- ]
238
- },
239
- {
240
- "cell_type": "code",
241
- "execution_count": 27,
242
- "metadata": {},
243
- "outputs": [
244
- {
245
- "name": "stdout",
246
- "output_type": "stream",
247
- "text": [
248
- "[('1903.04824v1', 'computer science', 'Proceedings of the Fifth International Conference on Cloud and Robotics (ICCR2018)', ' Huaxi, Zhang, Jacques Malenfan', '2019-03-12', '2019-03-12', 'http://arxiv.org/pdf/1903.04824v1'), ('1709.07597v1', 'economics', 'Inverse Reinforcement Learning with Conditional Choice Probabilities', 'Mohit Sharma, Kris M. Kitani, Joachim Groege', '2017-09-22', '2017-09-22', 'http://arxiv.org/pdf/1709.07597v1')]\n",
249
- "Sure, here are some key papers on model predictive control for nonlinear systems:\n",
250
- "\n",
251
- "* **Nonlinear Model Predictive Control: A Survey** by Garcia, P.D., Prett, D.M., and Morari, M. (1989)\n",
252
- "* **Model Predictive Control for Nonlinear Systems** by Camacho, E.F. and Bordons, C. (1999)\n",
253
- "* **Nonlinear Model Predictive Control** by Rawlings, J.B. and Mayne, D.Q. (2009)\n",
254
- "\n",
255
- "As for recent reviews on the application of control theory to robotics, here are a few:\n",
256
- "\n",
257
- "* **Control of Robot Manipulators** by Spong, M.W., Hutchinson, S., and Vidyasagar, M. (2006)\n",
258
- "* **Robotics: Modelling, Planning and Control** by Siciliano, B., Sciavicco, L., Villani, L., and Oriolo, G. (2010)\n",
259
- "* **Control of Robot Arms** by Featherstone, R. (2014)\n",
260
- "\n",
261
- "I hope this information is helpful. Please let me know if you have any other questions.\n"
262
- ]
263
- }
264
- ],
265
- "source": [
266
- "# test response, second step\n",
267
- "input_prompt = \"Can you suggest some key papers on model predictive control for nonlinear systems, and are there any recent reviews on the application of control theory to robotics?\"\n",
268
- "args = \"{\\n \\\"keywords\\\": [\\\"Model predictive control\\\", \\\"Nonlinear systems\\\", \\\"Robotics\\\", \\\"Control theory\\\"],\\n \\\"description\\\": \\\"Model predictive control (MPC) is a control algorithm that uses a model of the system to predict future behavior and optimize the control inputs. MPC is particularly well-suited for nonlinear systems, as it can handle the complex dynamics of these systems. In recent years, MPC has been increasingly applied to robotics, as it can improve the performance and safety of robotic systems. Control theory is a branch of mathematics that deals with the analysis and design of control systems. Control theory has been applied to a wide range of problems in robotics, including motion planning, trajectory tracking, and force control.\\\"\\n}\"\n",
269
- "args = json.loads(args)\n",
270
- "contexts, results = response(args)\n",
271
- "if not results:\n",
272
- " # direct answer\n",
273
- " print(contexts)\n",
274
- "else:\n",
275
- " output_prompt = make_answer_prompt(input_prompt,contexts)\n",
276
- " answer = model.generate_content(output_prompt).text\n",
277
- " print(answer)"
278
- ]
279
- },
280
- {
281
- "cell_type": "code",
282
- "execution_count": 7,
283
- "metadata": {},
284
- "outputs": [
285
- {
286
- "name": "stdout",
287
- "output_type": "stream",
288
- "text": [
289
- "{'desired': 'Natural Language Processing (Computer Science)', 'question': 'What are some recent papers on deep learning architectures for text classification, and can you recommend any surveys or reviews on the topic?'}\n",
290
- "0\n",
291
- "[('1808.08121v1', 'computer science', 'An Improvement of Data Classification Using Random Multimodel Deep Learning (RMDL)', 'Mojtaba Heidarysafa, Kamran Kowsari, Donald E. Brown, Kiana Jafari Meimandi, Laura E. Barne', '2018-08-23', '2018-08-23', 'http://arxiv.org/pdf/1808.08121v1'), ('1904.08067v5', 'computer science', 'Text Classification Algorithms: A Survey', 'Kamran Kowsari, Kiana Jafari Meimandi, Mojtaba Heidarysafa, Sanjana Mendu, Laura E. Barnes, Donald E. Brow', '2020-05-20', '2019-04-17', 'http://arxiv.org/pdf/1904.08067v5'), ('2202.09144v1', 'computer science', 'Modelling the semantics of text in complex document layouts using graph transformer networks', 'Thomas Roland Barillot, Jacob Saks, Polena Lilyanova, Edward Torgas, Yachen Hu, Yuanqing Liu, Varun Balupuri, Paul Gaskel', '2022-02-18', '2022-02-18', 'http://arxiv.org/pdf/2202.09144v1')]\n",
292
- "1\n",
293
- "[('1601.04187v1', 'computer science', 'Conversion of Artificial Recurrent Neural Networks to Spiking Neural Networks for Low-power Neuromorphic Hardware', 'Peter U. Diehl, Guido Zarrella, Andrew Cassidy, Bruno U. Pedroni, Emre Neftc', '2016-01-16', '2016-01-16', 'http://arxiv.org/pdf/1601.04187v1'), ('1801.01093v3', 'economics', 'Comparing the Forecasting Performances of Linear Models for Electricity Prices with High RES Penetration', 'Angelica Gianfreda, Francesco Ravazzolo, Luca Rossin', '2019-11-12', '2018-01-03', 'http://arxiv.org/pdf/1801.01093v3'), ('2302.11093v1', 'electrical engineering and system science', 'Use Cases for Time-Frequency Image Representations and Deep Learning Techniques for Improved Signal Classification', 'Mehmet Parla', '2023-02-22', '2023-02-22', 'http://arxiv.org/pdf/2302.11093v1')]\n",
294
- "2\n",
295
- "[('1505.07907v4', 'economics', 'Linking Economic Complexity, Institutions and Income Inequality', 'D. Hartmann, M. R. Guevara, C. Jara-Figueroa, M. Aristaran, C. A. Hidalg', '2017-01-04', '2015-05-29', 'http://arxiv.org/pdf/1505.07907v4'), ('2107.06855v2', 'economics', 'Comparing Intellectual property policy in the Global North and South -- A one-size-fits-all policy for economic prosperity?', 'S Sidhartha Narayan, Malavika Ranjan, Madhumitha Raghurama', '2021-08-10', '2021-07-14', 'http://arxiv.org/pdf/2107.06855v2'), ('1910.11780v1', 'economics', 'Inequality in Turkey: Looking Beyond Growth', 'Bayram Cakir, Ipek Ergu', '2019-10-25', '2019-10-25', 'http://arxiv.org/pdf/1910.11780v1')]\n",
296
- "3\n",
297
- "[('1607.06583v2', 'computer science', \"Classification of Alzheimer's Disease Structural MRI Data by Deep Learning Convolutional Neural Networks\", 'Saman Sarraf, Ghassem Tofigh', '2017-05-19', '2016-07-22', 'http://arxiv.org/pdf/1607.06583v2'), ('2101.10265v1', 'computer science', 'Superiorities of Deep Extreme Learning Machines against Convolutional Neural Networks', 'Gokhan Altan, Yakup Kutl', '2021-01-21', '2021-01-21', 'http://arxiv.org/pdf/2101.10265v1'), ('2208.03143v1', 'computer science', 'Deep Learning and Health Informatics for Smart Monitoring and Diagnosis', 'Amin Gasm', '2022-08-05', '2022-08-05', 'http://arxiv.org/pdf/2208.03143v1')]\n",
298
- "4\n",
299
- "[('2302.06584v3', 'computer science', 'Thermodynamic AI and the fluctuation frontier', 'Patrick J. Coles, Collin Szczepanski, Denis Melanson, Kaelan Donatella, Antonio J. Martinez, Faris Sbah', '2023-06-13', '2023-02-09', 'http://arxiv.org/pdf/2302.06584v3'), ('2307.12298v1', 'computer science', 'Stabilization and Dissipative Information Transfer of a Superconducting Kerr-Cat Qubit', 'Ufuk Korkmaz, Deniz Türkpenç', '2023-07-23', '2023-07-23', 'http://arxiv.org/pdf/2307.12298v1'), ('2106.10421v1', 'computer science', 'QFCNN: Quantum Fourier Convolutional Neural Network', 'Feihong Shen, Jun Li', '2021-06-19', '2021-06-19', 'http://arxiv.org/pdf/2106.10421v1')]\n",
300
- "5\n",
301
- "[('2308.16539v2', 'computer science', 'On a Connection between Differential Games, Optimal Control, and Energy-based Models for Multi-Agent Interactions', 'Christopher Diehl, Tobias Klosek, Martin Krüger, Nils Murzyn, Torsten Bertra', '2023-10-16', '2023-08-31', 'http://arxiv.org/pdf/2308.16539v2'), ('2404.12474v1', 'computer science', 'Learning a Stable, Safe, Distributed Feedback Controller for a Heterogeneous Platoon of Vehicles', 'Michael H. Shaham, Taskin Padi', '2024-04-18', '2024-04-18', 'http://arxiv.org/pdf/2404.12474v1'), ('2008.13221v1', 'computer science', 'Human-in-the-Loop Methods for Data-Driven and Reinforcement Learning Systems', 'Vinicius G. Goeck', '2020-08-30', '2020-08-30', 'http://arxiv.org/pdf/2008.13221v1')]\n",
302
- "6\n",
303
- "[('1911.06206v3', 'economics', 'Bayesian state-space modeling for analyzing heterogeneous network effects of US monetary policy', 'Niko Hauzenberger, Michael Pfarrhofe', '2020-09-10', '2019-11-14', 'http://arxiv.org/pdf/1911.06206v3'), ('2302.14114v1', 'economics', 'Econometric assessment of the monetary policy shocks in Morocco: Evidence from a Bayesian Factor-Augmented VAR', 'Marouane Daou', '2023-02-27', '2023-02-27', 'http://arxiv.org/pdf/2302.14114v1'), ('2311.11858v1', 'economics', 'Theory coherent shrinkage of Time-Varying Parameters in VARs', 'Andrea Renzett', '2023-11-20', '2023-11-20', 'http://arxiv.org/pdf/2311.11858v1')]\n",
304
- "7\n",
305
- "[('2310.03365v2', 'computer science', 'Swin-Tempo: Temporal-Aware Lung Nodule Detection in CT Scans as Video Sequences Using Swin Transformer-Enhanced UNet', 'Hossein Jafari, Karim Faez, Hamidreza Amindava', '2023-10-14', '2023-10-05', 'http://arxiv.org/pdf/2310.03365v2'), ('1808.08531v1', 'computer science', 'DeepTracker: Visualizing the Training Process of Convolutional Neural Networks', 'Dongyu Liu, Weiwei Cui, Kai Jin, Yuxiao Guo, Huamin Q', '2018-08-26', '2018-08-26', 'http://arxiv.org/pdf/1808.08531v1'), ('2105.10448v1', 'computer science', 'Distinguishing artefacts: evaluating the saturation point of convolutional neural networks', 'Ric Real, James Gopsill, David Jones, Chris Snider, Ben Hick', '2021-05-21', '2021-05-21', 'http://arxiv.org/pdf/2105.10448v1')]\n",
306
- "8\n",
307
- "Got new records: 10\n",
308
- "Re-queried on chromadb, results: []\n",
309
- "None\n",
310
- "9\n",
311
- "[('2403.07017v1', 'computer science', 'Mathematics of multi-agent learning systems at the interface of game theory and artificial intelligence', 'Long Wang, Feng Fu, Xingru Che', '2024-03-09', '2024-03-09', 'http://arxiv.org/pdf/2403.07017v1'), ('2210.02205v1', 'computer science', 'Game Theoretic Rating in N-player general-sum games with Equilibria', 'Luke Marris, Marc Lanctot, Ian Gemp, Shayegan Omidshafiei, Stephen McAleer, Jerome Connor, Karl Tuyls, Thore Graepe', '2022-10-05', '2022-10-05', 'http://arxiv.org/pdf/2210.02205v1'), ('2212.05357v3', 'economics', 'On Blockchain We Cooperate: An Evolutionary Game Perspective', 'Luyao Zhang, Xinyu Tia', '2023-01-19', '2022-12-10', 'http://arxiv.org/pdf/2212.05357v3')]\n"
312
- ]
313
- }
314
- ],
315
- "source": [
316
- "with open(\"test_questions.txt\",\"r\") as infile:\n",
317
- " data = json.load(infile)\n",
318
- "print(data[0])\n",
319
- "\n",
320
- "test_log = []\n",
321
- "for i,t in enumerate(data):\n",
322
- " print(i)\n",
323
- " temp_answer, answer = full_chain_single_question(t['question'])\n",
324
- " test_log.append({'desired topic':t['desired'],\n",
325
- " 'question':t['question'],\n",
326
- " 'first answer':temp_answer,\n",
327
- " 'final answer':answer})\n",
328
- "with open(\"test_results.json\",\"w\") as outfile:\n",
329
- " json.dump(test_log,outfile)"
330
- ]
331
- }
332
- ],
333
- "metadata": {
334
- "kernelspec": {
335
- "display_name": "Python 3",
336
- "language": "python",
337
- "name": "python3"
338
- },
339
- "language_info": {
340
- "codemirror_mode": {
341
- "name": "ipython",
342
- "version": 3
343
- },
344
- "file_extension": ".py",
345
- "mimetype": "text/x-python",
346
- "name": "python",
347
- "nbconvert_exporter": "python",
348
- "pygments_lexer": "ipython3",
349
- "version": "3.10.12"
350
- }
351
- },
352
- "nbformat": 4,
353
- "nbformat_minor": 2
354
- }
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n",
14
+ "d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
15
+ " warnings.warn(\n",
16
+ "d:\\Program\\Anaconda\\envs\\python_project\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
17
+ " warnings.warn(\n"
18
+ ]
19
+ }
20
+ ],
21
+ "source": [
22
+ "import google.generativeai as genai\n",
23
+ "import arxiv_bot_utils as utils\n",
24
+ "import os\n",
25
+ "from getpass import getpass\n",
26
+ "import json\n",
27
+      "# just a normal import\n",
28
+      "# the content is "
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stdout",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "models/gemini-1.0-pro\n",
41
+ "models/gemini-1.0-pro-001\n",
42
+ "models/gemini-1.0-pro-latest\n",
43
+ "models/gemini-1.0-pro-vision-latest\n",
44
+ "models/gemini-1.5-pro-latest\n",
45
+ "models/gemini-pro\n",
46
+ "models/gemini-pro-vision\n"
47
+ ]
48
+ }
49
+ ],
50
+ "source": [
51
+ "os.environ['GEMINI_API_KEY'] = getpass(\"Input your API key: \")\n",
52
+      "# set the environment variable right away\n",
53
+      "gemini_api_key = os.getenv(\"GEMINI_API_KEY\") # the key string from the environment\n",
54
+ "if not gemini_api_key:\n",
55
+ " raise ValueError(\n",
56
+ " \"Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable\"\n",
57
+ " )\n",
58
+ "genai.configure(api_key=gemini_api_key)\n",
59
+ "for m in genai.list_models():\n",
60
+ " if 'generateContent' in m.supported_generation_methods:\n",
61
+ " print(m.name)\n",
62
+      "        # the models are hosted on the server\n"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 3,
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "config = genai.GenerationConfig(max_output_tokens=2048,\n",
72
+ " temperature=0.7)\n",
73
+ "safety_settings = [\n",
74
+ " {\n",
75
+ " \"category\": \"HARM_CATEGORY_DANGEROUS\",\n",
76
+ " \"threshold\": \"BLOCK_NONE\",\n",
77
+ " },\n",
78
+ " {\n",
79
+ " \"category\": \"HARM_CATEGORY_HARASSMENT\",\n",
80
+ " \"threshold\": \"BLOCK_NONE\",\n",
81
+ " },\n",
82
+ " {\n",
83
+ " \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n",
84
+ " \"threshold\": \"BLOCK_NONE\",\n",
85
+ " },\n",
86
+ " {\n",
87
+ " \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n",
88
+ " \"threshold\": \"BLOCK_NONE\",\n",
89
+ " },\n",
90
+ " {\n",
91
+ " \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n",
92
+ " \"threshold\": \"BLOCK_NONE\",\n",
93
+ " },\n",
94
+ "]\n",
95
+ "model = genai.GenerativeModel(\"gemini-pro\",\n",
96
+ " generation_config=config,\n",
97
+ " safety_settings=safety_settings)"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 4,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "def extract_keyword_prompt(query):\n",
107
+ " \"\"\"A prompt that return a JSON block as arguments for querying database\"\"\"\n",
108
+      "    \"\"\"A prompt that returns a JSON block as arguments for querying the database\"\"\"\n",
109
+ " prompt = (\n",
110
+ " \"\"\"[INST] SYSTEM: You are an assistant that choose only one action below based on guest question.\n",
111
+      "    \"\"\"[INST] SYSTEM: You are an assistant that chooses only one action below based on the guest's question.\n",
112
+      "    1. If the guest question is asking for a single specific document or article with an explicit title, you need to respond with the information in JSON format with 2 keys \"title\", \"author\" if any are found above. The authors are separated with the word 'and'. \n",
113
+      "    2. If the guest question is asking for relevant information about a topic, you need to respond with the information in JSON format with 2 keys \"keywords\", \"description\", including a list of keywords that represent the main academic topic, \\\n",
114
+ " 3. If the guest is not asking for any informations or documents, you need to respond with a polite answer in JSON format with 1 key \"answer\".\n",
115
+      "    3. If the guest is not asking for any information or documents, you need to respond with a polite answer in JSON format with 1 key \"answer\".\n",
116
+ " [/INST]\n",
117
+ " ANSWER: \n",
118
+ " \"\"\"\n",
119
+ " ).format(query=query)\n",
120
+ "\n",
121
+ " return prompt\n",
122
+ "\n",
123
+ "def make_answer_prompt(input, contexts):\n",
124
+      "    \"\"\"A prompt that returns the final answer, based on the queried context\"\"\"\n",
125
+ "\n",
126
+ " prompt = (\n",
127
+      "    \"\"\"[INST] You are a library assistant that helps to search for articles and documents based on the user's question.\n",
128
+      "    From the guest's question, you have found some records and documents that may help. Now you need to answer the guest with the information found.\n",
129
+      "    If no information is found in the database, you may generate other recommendations related to the user's question using your own knowledge. Each article or paper must have a link to the PDF download page.\n",
130
+ " You should answer in a conversational form politely.\n",
131
+ " QUESTION: '{input}'\n",
132
+ " INFORMATION: '{contexts}'\n",
133
+ " [/INST]\n",
134
+ " ANSWER:\n",
135
+ " \"\"\"\n",
136
+ " ).format(input=input, contexts=contexts)\n",
137
+ "\n",
138
+ " return prompt"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 5,
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "def response(args):\n",
148
+ " \"\"\"Create response context, based on input arguments\"\"\"\n",
149
+ " keys = list(dict.keys(args))\n",
150
+ " if \"answer\" in keys:\n",
151
+      "        return args['answer'], None # answer directly\n",
152
+ " \n",
153
+ " if \"keywords\" in keys:\n",
154
+ " # perform query\n",
155
+ " query_texts = args[\"description\"]\n",
156
+ " keywords = args[\"keywords\"]\n",
157
+ " results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
158
+ " # print(results)\n",
159
+ " ids = results['metadatas'][0]\n",
160
+ " if len(ids) == 0:\n",
161
+ " # go crawl some\n",
162
+ " new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)\n",
163
+ " print(\"Got new records: \",len(new_records))\n",
164
+ " if type(new_records) == str:\n",
165
+      "            return \"Error occurred, information not found\", new_records\n",
166
+ " utils.db.add(new_records)\n",
167
+ " utils.sqldb.add(new_records)\n",
168
+ " results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)\n",
169
+ " ids = results['metadatas'][0]\n",
170
+ " print(\"Re-queried on chromadb, results: \",ids)\n",
171
+ " paper_id = [id['paper_id'] for id in ids]\n",
172
+ " paper_info = utils.sqldb.query_id(paper_id)\n",
173
+ " print(paper_info)\n",
174
+ " records = [] # get title (2), author (3), link (6)\n",
175
+ " result_string = \"\"\n",
176
+ " if paper_info:\n",
177
+ " for i in range(len(paper_info)):\n",
178
+ " result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
179
+ " records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
180
+ " return result_string, records\n",
181
+ " else:\n",
182
+ " return \"Information not found\", \"Information not found\"\n",
183
+ " # invoke llm and return result\n",
184
+ "\n",
185
+ " if \"title\" in keys:\n",
186
+ " title = args['title']\n",
187
+ " authors = utils.authors_str_to_list(args['author'])\n",
188
+ " paper_info = utils.sqldb.query(title = title,author = authors)\n",
189
+      "        # if the query finds nothing, go crawl for it\n",
190
+ " # print(paper_info)\n",
191
+ "\n",
192
+ " if len(paper_info) == 0:\n",
193
+ " new_records = utils.crawl_exact_paper(title=title,author=authors)\n",
194
+ " print(\"Got new records: \",len(new_records))\n",
195
+ " if type(new_records) == str:\n",
196
+ " # print(new_records)\n",
197
+      "            return \"Error occurred, information not found\", \"Information not found\"\n",
198
+ " utils.db.add(new_records)\n",
199
+ " utils.sqldb.add(new_records)\n",
200
+ " paper_info = utils.sqldb.query(title = title,author = authors)\n",
201
+ " print(\"Re-queried on chromadb, results: \",paper_info)\n",
202
+ " # -------------------------------------\n",
203
+ " records = [] # get title (2), author (3), link (6)\n",
204
+ " result_string = \"\"\n",
205
+ " for i in range(len(paper_info)):\n",
206
+ " result_string += \"Title: {}, Author: {}, Link: {}\".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])\n",
207
+ " records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])\n",
208
+ " # process results:\n",
209
+ " if len(result_string) == 0:\n",
210
+ " return \"Information not found\", \"Information not found\"\n",
211
+ " return result_string, records\n",
212
+ " # invoke llm and return result"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 6,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "def full_chain_single_question(input_prompt):\n",
222
+ " try:\n",
223
+ " first_prompt = extract_keyword_prompt(input_prompt)\n",
224
+ " temp_answer = model.generate_content(first_prompt).text\n",
225
+ "\n",
226
+ " args = json.loads(utils.trimming(temp_answer))\n",
227
+ " contexts, results = response(args)\n",
228
+ " if not results:\n",
229
+ " print(contexts)\n",
230
+ " else:\n",
231
+ " output_prompt = make_answer_prompt(input_prompt,contexts)\n",
232
+ " answer = model.generate_content(output_prompt).text\n",
233
+ " return temp_answer, answer\n",
234
+ " except Exception as e:\n",
235
+ " print(e)\n",
236
+      "        return temp_answer, \"Error occurred: \" + str(e)"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 27,
242
+ "metadata": {},
243
+ "outputs": [
244
+ {
245
+ "name": "stdout",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "[('1903.04824v1', 'computer science', 'Proceedings of the Fifth International Conference on Cloud and Robotics (ICCR2018)', ' Huaxi, Zhang, Jacques Malenfan', '2019-03-12', '2019-03-12', 'http://arxiv.org/pdf/1903.04824v1'), ('1709.07597v1', 'economics', 'Inverse Reinforcement Learning with Conditional Choice Probabilities', 'Mohit Sharma, Kris M. Kitani, Joachim Groege', '2017-09-22', '2017-09-22', 'http://arxiv.org/pdf/1709.07597v1')]\n",
249
+ "Sure, here are some key papers on model predictive control for nonlinear systems:\n",
250
+ "\n",
251
+ "* **Nonlinear Model Predictive Control: A Survey** by Garcia, P.D., Prett, D.M., and Morari, M. (1989)\n",
252
+ "* **Model Predictive Control for Nonlinear Systems** by Camacho, E.F. and Bordons, C. (1999)\n",
253
+ "* **Nonlinear Model Predictive Control** by Rawlings, J.B. and Mayne, D.Q. (2009)\n",
254
+ "\n",
255
+ "As for recent reviews on the application of control theory to robotics, here are a few:\n",
256
+ "\n",
257
+ "* **Control of Robot Manipulators** by Spong, M.W., Hutchinson, S., and Vidyasagar, M. (2006)\n",
258
+ "* **Robotics: Modelling, Planning and Control** by Siciliano, B., Sciavicco, L., Villani, L., and Oriolo, G. (2010)\n",
259
+ "* **Control of Robot Arms** by Featherstone, R. (2014)\n",
260
+ "\n",
261
+ "I hope this information is helpful. Please let me know if you have any other questions.\n"
262
+ ]
263
+ }
264
+ ],
265
+ "source": [
266
+ "# test response, second step\n",
267
+ "input_prompt = \"Can you suggest some key papers on model predictive control for nonlinear systems, and are there any recent reviews on the application of control theory to robotics?\"\n",
268
+ "args = \"{\\n \\\"keywords\\\": [\\\"Model predictive control\\\", \\\"Nonlinear systems\\\", \\\"Robotics\\\", \\\"Control theory\\\"],\\n \\\"description\\\": \\\"Model predictive control (MPC) is a control algorithm that uses a model of the system to predict future behavior and optimize the control inputs. MPC is particularly well-suited for nonlinear systems, as it can handle the complex dynamics of these systems. In recent years, MPC has been increasingly applied to robotics, as it can improve the performance and safety of robotic systems. Control theory is a branch of mathematics that deals with the analysis and design of control systems. Control theory has been applied to a wide range of problems in robotics, including motion planning, trajectory tracking, and force control.\\\"\\n}\"\n",
269
+ "args = json.loads(args)\n",
270
+ "contexts, results = response(args)\n",
271
+ "if not results:\n",
272
+ " # direct answer\n",
273
+ " print(contexts)\n",
274
+ "else:\n",
275
+ " output_prompt = make_answer_prompt(input_prompt,contexts)\n",
276
+ " answer = model.generate_content(output_prompt).text\n",
277
+ " print(answer)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 7,
283
+ "metadata": {},
284
+ "outputs": [
285
+ {
286
+ "name": "stdout",
287
+ "output_type": "stream",
288
+ "text": [
289
+ "{'desired': 'Natural Language Processing (Computer Science)', 'question': 'What are some recent papers on deep learning architectures for text classification, and can you recommend any surveys or reviews on the topic?'}\n",
290
+ "0\n",
291
+ "[('1808.08121v1', 'computer science', 'An Improvement of Data Classification Using Random Multimodel Deep Learning (RMDL)', 'Mojtaba Heidarysafa, Kamran Kowsari, Donald E. Brown, Kiana Jafari Meimandi, Laura E. Barne', '2018-08-23', '2018-08-23', 'http://arxiv.org/pdf/1808.08121v1'), ('1904.08067v5', 'computer science', 'Text Classification Algorithms: A Survey', 'Kamran Kowsari, Kiana Jafari Meimandi, Mojtaba Heidarysafa, Sanjana Mendu, Laura E. Barnes, Donald E. Brow', '2020-05-20', '2019-04-17', 'http://arxiv.org/pdf/1904.08067v5'), ('2202.09144v1', 'computer science', 'Modelling the semantics of text in complex document layouts using graph transformer networks', 'Thomas Roland Barillot, Jacob Saks, Polena Lilyanova, Edward Torgas, Yachen Hu, Yuanqing Liu, Varun Balupuri, Paul Gaskel', '2022-02-18', '2022-02-18', 'http://arxiv.org/pdf/2202.09144v1')]\n",
292
+ "1\n",
293
+ "[('1601.04187v1', 'computer science', 'Conversion of Artificial Recurrent Neural Networks to Spiking Neural Networks for Low-power Neuromorphic Hardware', 'Peter U. Diehl, Guido Zarrella, Andrew Cassidy, Bruno U. Pedroni, Emre Neftc', '2016-01-16', '2016-01-16', 'http://arxiv.org/pdf/1601.04187v1'), ('1801.01093v3', 'economics', 'Comparing the Forecasting Performances of Linear Models for Electricity Prices with High RES Penetration', 'Angelica Gianfreda, Francesco Ravazzolo, Luca Rossin', '2019-11-12', '2018-01-03', 'http://arxiv.org/pdf/1801.01093v3'), ('2302.11093v1', 'electrical engineering and system science', 'Use Cases for Time-Frequency Image Representations and Deep Learning Techniques for Improved Signal Classification', 'Mehmet Parla', '2023-02-22', '2023-02-22', 'http://arxiv.org/pdf/2302.11093v1')]\n",
294
+ "2\n",
295
+ "[('1505.07907v4', 'economics', 'Linking Economic Complexity, Institutions and Income Inequality', 'D. Hartmann, M. R. Guevara, C. Jara-Figueroa, M. Aristaran, C. A. Hidalg', '2017-01-04', '2015-05-29', 'http://arxiv.org/pdf/1505.07907v4'), ('2107.06855v2', 'economics', 'Comparing Intellectual property policy in the Global North and South -- A one-size-fits-all policy for economic prosperity?', 'S Sidhartha Narayan, Malavika Ranjan, Madhumitha Raghurama', '2021-08-10', '2021-07-14', 'http://arxiv.org/pdf/2107.06855v2'), ('1910.11780v1', 'economics', 'Inequality in Turkey: Looking Beyond Growth', 'Bayram Cakir, Ipek Ergu', '2019-10-25', '2019-10-25', 'http://arxiv.org/pdf/1910.11780v1')]\n",
296
+ "3\n",
297
+ "[('1607.06583v2', 'computer science', \"Classification of Alzheimer's Disease Structural MRI Data by Deep Learning Convolutional Neural Networks\", 'Saman Sarraf, Ghassem Tofigh', '2017-05-19', '2016-07-22', 'http://arxiv.org/pdf/1607.06583v2'), ('2101.10265v1', 'computer science', 'Superiorities of Deep Extreme Learning Machines against Convolutional Neural Networks', 'Gokhan Altan, Yakup Kutl', '2021-01-21', '2021-01-21', 'http://arxiv.org/pdf/2101.10265v1'), ('2208.03143v1', 'computer science', 'Deep Learning and Health Informatics for Smart Monitoring and Diagnosis', 'Amin Gasm', '2022-08-05', '2022-08-05', 'http://arxiv.org/pdf/2208.03143v1')]\n",
298
+ "4\n",
299
+ "[('2302.06584v3', 'computer science', 'Thermodynamic AI and the fluctuation frontier', 'Patrick J. Coles, Collin Szczepanski, Denis Melanson, Kaelan Donatella, Antonio J. Martinez, Faris Sbah', '2023-06-13', '2023-02-09', 'http://arxiv.org/pdf/2302.06584v3'), ('2307.12298v1', 'computer science', 'Stabilization and Dissipative Information Transfer of a Superconducting Kerr-Cat Qubit', 'Ufuk Korkmaz, Deniz Türkpenç', '2023-07-23', '2023-07-23', 'http://arxiv.org/pdf/2307.12298v1'), ('2106.10421v1', 'computer science', 'QFCNN: Quantum Fourier Convolutional Neural Network', 'Feihong Shen, Jun Li', '2021-06-19', '2021-06-19', 'http://arxiv.org/pdf/2106.10421v1')]\n",
300
+ "5\n",
301
+ "[('2308.16539v2', 'computer science', 'On a Connection between Differential Games, Optimal Control, and Energy-based Models for Multi-Agent Interactions', 'Christopher Diehl, Tobias Klosek, Martin Krüger, Nils Murzyn, Torsten Bertra', '2023-10-16', '2023-08-31', 'http://arxiv.org/pdf/2308.16539v2'), ('2404.12474v1', 'computer science', 'Learning a Stable, Safe, Distributed Feedback Controller for a Heterogeneous Platoon of Vehicles', 'Michael H. Shaham, Taskin Padi', '2024-04-18', '2024-04-18', 'http://arxiv.org/pdf/2404.12474v1'), ('2008.13221v1', 'computer science', 'Human-in-the-Loop Methods for Data-Driven and Reinforcement Learning Systems', 'Vinicius G. Goeck', '2020-08-30', '2020-08-30', 'http://arxiv.org/pdf/2008.13221v1')]\n",
302
+ "6\n",
303
+ "[('1911.06206v3', 'economics', 'Bayesian state-space modeling for analyzing heterogeneous network effects of US monetary policy', 'Niko Hauzenberger, Michael Pfarrhofe', '2020-09-10', '2019-11-14', 'http://arxiv.org/pdf/1911.06206v3'), ('2302.14114v1', 'economics', 'Econometric assessment of the monetary policy shocks in Morocco: Evidence from a Bayesian Factor-Augmented VAR', 'Marouane Daou', '2023-02-27', '2023-02-27', 'http://arxiv.org/pdf/2302.14114v1'), ('2311.11858v1', 'economics', 'Theory coherent shrinkage of Time-Varying Parameters in VARs', 'Andrea Renzett', '2023-11-20', '2023-11-20', 'http://arxiv.org/pdf/2311.11858v1')]\n",
304
+ "7\n",
305
+ "[('2310.03365v2', 'computer science', 'Swin-Tempo: Temporal-Aware Lung Nodule Detection in CT Scans as Video Sequences Using Swin Transformer-Enhanced UNet', 'Hossein Jafari, Karim Faez, Hamidreza Amindava', '2023-10-14', '2023-10-05', 'http://arxiv.org/pdf/2310.03365v2'), ('1808.08531v1', 'computer science', 'DeepTracker: Visualizing the Training Process of Convolutional Neural Networks', 'Dongyu Liu, Weiwei Cui, Kai Jin, Yuxiao Guo, Huamin Q', '2018-08-26', '2018-08-26', 'http://arxiv.org/pdf/1808.08531v1'), ('2105.10448v1', 'computer science', 'Distinguishing artefacts: evaluating the saturation point of convolutional neural networks', 'Ric Real, James Gopsill, David Jones, Chris Snider, Ben Hick', '2021-05-21', '2021-05-21', 'http://arxiv.org/pdf/2105.10448v1')]\n",
306
+ "8\n",
307
+ "Got new records: 10\n",
308
+ "Re-queried on chromadb, results: []\n",
309
+ "None\n",
310
+ "9\n",
311
+ "[('2403.07017v1', 'computer science', 'Mathematics of multi-agent learning systems at the interface of game theory and artificial intelligence', 'Long Wang, Feng Fu, Xingru Che', '2024-03-09', '2024-03-09', 'http://arxiv.org/pdf/2403.07017v1'), ('2210.02205v1', 'computer science', 'Game Theoretic Rating in N-player general-sum games with Equilibria', 'Luke Marris, Marc Lanctot, Ian Gemp, Shayegan Omidshafiei, Stephen McAleer, Jerome Connor, Karl Tuyls, Thore Graepe', '2022-10-05', '2022-10-05', 'http://arxiv.org/pdf/2210.02205v1'), ('2212.05357v3', 'economics', 'On Blockchain We Cooperate: An Evolutionary Game Perspective', 'Luyao Zhang, Xinyu Tia', '2023-01-19', '2022-12-10', 'http://arxiv.org/pdf/2212.05357v3')]\n"
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "with open(\"test_questions.txt\",\"r\") as infile:\n",
317
+ " data = json.load(infile)\n",
318
+ "print(data[0])\n",
319
+ "\n",
320
+ "test_log = []\n",
321
+ "for i,t in enumerate(data):\n",
322
+ " print(i)\n",
323
+ " temp_answer, answer = full_chain_single_question(t['question'])\n",
324
+ " test_log.append({'desired topic':t['desired'],\n",
325
+ " 'question':t['question'],\n",
326
+ " 'first answer':temp_answer,\n",
327
+ " 'final answer':answer})\n",
328
+ "with open(\"test_results.json\",\"w\") as outfile:\n",
329
+ " json.dump(test_log,outfile)"
330
+ ]
331
+ }
332
+ ],
333
+ "metadata": {
334
+ "kernelspec": {
335
+ "display_name": "Python 3",
336
+ "language": "python",
337
+ "name": "python3"
338
+ },
339
+ "language_info": {
340
+ "codemirror_mode": {
341
+ "name": "ipython",
342
+ "version": 3
343
+ },
344
+ "file_extension": ".py",
345
+ "mimetype": "text/x-python",
346
+ "name": "python",
347
+ "nbconvert_exporter": "python",
348
+ "pygments_lexer": "ipython3",
349
+ "version": "3.10.12"
350
+ }
351
+ },
352
+ "nbformat": 4,
353
+ "nbformat_minor": 2
354
+ }
chat/consumers.py CHANGED
@@ -1,21 +1,25 @@
1
  import json
2
- from . import model_manage as md
3
- from chat.arxiv_bot.arxiv_bot_utils import ArxivSQL
4
  from channels.generic.websocket import WebsocketConsumer
5
 
6
 
7
  class ChatConsumer(WebsocketConsumer):
8
  def connect(self):
9
  self.accept()
10
- self.db_instance = ArxivSQL()
11
 
12
  def disconnect(self, close_code):
 
13
  pass
 
14
  def receive(self, text_data):
15
  text_data_json = json.loads(text_data)
16
  message = text_data_json["messages"]
17
  print(message)
18
- record, messagee = md.full_chain_history_question(message, self.db_instance)
19
- print("First answer: ",record)
20
- self.send(text_data=json.dumps({"message": messagee}))
 
 
 
21
 
 
1
  import json
2
+ from . import model_manage2 as md
 
3
  from channels.generic.websocket import WebsocketConsumer
4
 
5
 
6
  class ChatConsumer(WebsocketConsumer):
7
  def connect(self):
8
  self.accept()
9
+ self.model, self.session = md.init_model("auto")
10
 
11
  def disconnect(self, close_code):
12
+ del self.model, self.session
13
  pass
14
+
15
  def receive(self, text_data):
16
  text_data_json = json.loads(text_data)
17
  message = text_data_json["messages"]
18
  print(message)
19
+ question = message[-1]['content']
20
+ response, history_state = md.full_chain_history_question(question, self.session, mode="auto")
21
+ # print("First answer: ",response)
22
+ print("Session history:")
23
+ md.print_history(history_state)
24
+ self.send(text_data=json.dumps({"message": response}))
25
 
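Below is a minimal client-side sketch (added for illustration, not part of this commit) of the payload shape the refactored ChatConsumer expects: receive() reads text_data_json["messages"] as the chat history and replies with a single "message" key. The websocket-client package and the ws/chat/ route name are assumptions.

import json
from websocket import create_connection  # hypothetical client dependency: pip install websocket-client

ws = create_connection("ws://localhost:8000/ws/chat/")  # route path is an assumption; see chat/routing.py
ws.send(json.dumps({"messages": [{"role": "user", "content": "Find recent papers on model predictive control"}]}))
print(json.loads(ws.recv())["message"])  # the consumer answers with {"message": <model response>}
ws.close()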
chat/migrations/__pycache__/0001_initial.cpython-311.pyc ADDED
Binary file (1.01 kB). View file
 
chat/migrations/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (171 Bytes). View file
 
chat/model_manage.py CHANGED
@@ -1,271 +1,271 @@
1
- # my_app/model_manager.py
2
- import google.generativeai as genai
3
- import chat.arxiv_bot.arxiv_bot_utils as utils
4
- import json
5
 
6
- model = None
7
 
8
- model_retrieval = None
9
 
10
- model_answer = None
11
 
12
- RETRIEVAL_INSTRUCT = """You are an auto chatbot that response with only one action below based on user question.
13
- 1. If the guest question is asking about a science topic, you need to respond the information in JSON schema below:
14
- {
15
- "keywords": [a list of string keywords about the topic],
16
- "description": "a paragraph describing the topic in about 50 to 100 words"
17
- }
18
- 2. If the guest is not asking for any informations or documents, you need to respond in JSON schema below:
19
- {
20
- "answer": "your answer to the user question"
21
- }"""
22
 
23
- ANSWER_INSTRUCT = """You are a library assistant that help answering customer question based on the information given.
24
- You always answer in a conversational form naturally and politely.
25
- You must introduce all the records given, each must contain title, authors and the link to the pdf file."""
26
 
27
- def create_model():
28
- with open("apikey.txt","r") as apikey:
29
- key = apikey.readline()
30
- genai.configure(api_key=key)
31
- for m in genai.list_models():
32
- if 'generateContent' in m.supported_generation_methods:
33
- print(m.name)
34
- print("He was there")
35
- config = genai.GenerationConfig(max_output_tokens=2048,
36
- temperature=1.0)
37
- safety_settings = [
38
- {
39
- "category": "HARM_CATEGORY_DANGEROUS",
40
- "threshold": "BLOCK_NONE",
41
- },
42
- {
43
- "category": "HARM_CATEGORY_HARASSMENT",
44
- "threshold": "BLOCK_NONE",
45
- },
46
- {
47
- "category": "HARM_CATEGORY_HATE_SPEECH",
48
- "threshold": "BLOCK_NONE",
49
- },
50
- {
51
- "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
52
- "threshold": "BLOCK_NONE",
53
- },
54
- {
55
- "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
56
- "threshold": "BLOCK_NONE",
57
- },
58
- ]
59
- global model, model_retrieval, model_answer
60
- model = genai.GenerativeModel("gemini-1.5-pro-latest",
61
- generation_config=config,
62
- safety_settings=safety_settings)
63
- model_retrieval = genai.GenerativeModel("gemini-1.5-pro-latest",
64
- generation_config=config,
65
- safety_settings=safety_settings,
66
- system_instruction=RETRIEVAL_INSTRUCT)
67
- model_answer = genai.GenerativeModel("gemini-1.5-pro-latest",
68
- generation_config=config,
69
- safety_settings=safety_settings,
70
- system_instruction=ANSWER_INSTRUCT)
71
- return model, model_answer, model_retrieval
72
 
73
- def get_model():
74
- global model, model_answer, model_retrieval
75
- if model is None:
76
- # Khởi tạo model ở đây
77
- model, model_answer, model_retrieval = create_model() # Giả sử create_model là hàm tạo model của bạn
78
- return model, model_answer, model_retrieval
79
 
80
- def extract_keyword_prompt(query):
81
- """A prompt that return a JSON block as arguments for querying database"""
82
 
83
- prompt = """[INST] SYSTEM: You are an auto chatbot that response with only one action below based on user question.
84
- 1. If the guest question is asking about a science topic, you need to respond the information in JSON schema below:
85
- {
86
- "keywords": [a list of string keywords about the topic],
87
- "description": "a paragraph describing the topic in about 50 to 100 words"
88
- }
89
- 2. If the guest is not asking for any informations or documents, you need to respond in JSON schema below:
90
- {
91
- "answer": "your answer to the user question"
92
- }
93
- QUESTION: """ + query + """[/INST]
94
- ANSWER: """
95
- return prompt
96
 
97
- def make_answer_prompt(input, contexts):
98
- """A prompt that return the final answer, based on the queried context"""
99
 
100
- prompt = (
101
- """[INST] You are a library assistant that help answering customer QUESTION based on the INFORMATION given.
102
- You always answer in a conversational form naturally and politely.
103
- You must introduce all the records given, each must contain title, authors and the link to the pdf file.
104
- QUESTION: {input}
105
- INFORMATION: '{contexts}'
106
- [/INST]
107
- ANSWER:
108
- """
109
- ).format(input=input, contexts=contexts)
110
- return prompt
111
 
112
- def retrieval_chat_template(question):
113
- return {
114
- "role":"user",
115
- "parts":[f"QUESTION: {question} \n ANSWER:"]
116
- }
117
 
118
- def answer_chat_template(question, contexts):
119
- return {
120
- "role":"user",
121
- "parts":[f"QUESTION: {question} \n INFORMATION: {contexts} \n ANSWER:"]
122
- }
123
 
124
- def response(args, db_instance):
125
- """Create response context, based on input arguments"""
126
- keys = list(dict.keys(args))
127
- if "answer" in keys:
128
-             return args['answer'], None # answer directly
129
 
130
- if "keywords" in keys:
131
- # perform query
132
- query_texts = args["description"]
133
- keywords = args["keywords"]
134
- results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)
135
- # print(results)
136
- ids = results['metadatas'][0]
137
- if len(ids) == 0:
138
- # go crawl some
139
- new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)
140
- print("Got new records: ",len(new_records))
141
- if type(new_records) == str:
142
- return "Error occured, information not found", new_records
143
- utils.db.add(new_records)
144
- db_instance.add(new_records)
145
- results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)
146
- ids = results['metadatas'][0]
147
- print("Re-queried on chromadb, results: ",ids)
148
- paper_id = [id['paper_id'] for id in ids]
149
- paper_info = db_instance.query_id(paper_id)
150
- print(paper_info)
151
- records = [] # get title (2), author (3), link (6)
152
- result_string = ""
153
- if paper_info:
154
- for i in range(len(paper_info)):
155
- result_string += "Record no.{} - Title: {}, Author: {}, Link: {}, ".format(i+1,paper_info[i][2],paper_info[i][3],paper_info[i][6])
156
- id = paper_info[i][0]
157
- selected_document = utils.db.query_exact(id)["documents"]
158
- doc_str = "Summary:"
159
- for doc in selected_document:
160
- doc_str+= doc + " "
161
- result_string += doc_str
162
- records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
163
- return result_string, records
164
- else:
165
- return "Information not found", "Information not found"
166
- # invoke llm and return result
167
-
168
- # if "title" in keys:
169
- # title = args['title']
170
- # authors = utils.authors_str_to_list(args['author'])
171
- # paper_info = db_instance.query(title = title,author = authors)
172
- # # if query not found then go crawl brh
173
- # # print(paper_info)
174
-
175
- # if len(paper_info) == 0:
176
- # new_records = utils.crawl_exact_paper(title=title,author=authors)
177
- # print("Got new records: ",len(new_records))
178
- # if type(new_records) == str:
179
- # # print(new_records)
180
- # return "Error occured, information not found", "Information not found"
181
- # utils.db.add(new_records)
182
- # db_instance.add(new_records)
183
- # paper_info = db_instance.query(title = title,author = authors)
184
- # print("Re-queried on chromadb, results: ",paper_info)
185
- # # -------------------------------------
186
- # records = [] # get title (2), author (3), link (6)
187
- # result_string = ""
188
- # for i in range(len(paper_info)):
189
- # result_string += "Title: {}, Author: {}, Link: {}".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])
190
- # records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
191
- # # process results:
192
- # if len(result_string) == 0:
193
- # return "Information not found", "Information not found"
194
- # return result_string, records
195
- # invoke llm and return result
196
-
197
- def full_chain_single_question(input_prompt, db_instance):
198
- try:
199
- first_prompt = extract_keyword_prompt(input_prompt)
200
- temp_answer = model.generate_content(first_prompt).text
201
 
202
- args = json.loads(utils.trimming(temp_answer))
203
- contexts, results = response(args, db_instance)
204
- if not results:
205
- # print(contexts)
206
- return "Random question, direct return", contexts
207
- else:
208
- output_prompt = make_answer_prompt(input_prompt,contexts)
209
- answer = model.generate_content(output_prompt).text
210
- return temp_answer, answer
211
- except Exception as e:
212
- # print(e)
213
- return temp_answer, "Error occured: " + str(e)
214
-
215
 
216
- def format_chat_history_from_web(chat_history: list):
217
- temp_chat = []
218
- for message in chat_history:
219
- temp_chat.append(
220
- {
221
- "role": message["role"],
222
- "parts": [message["content"]]
223
- }
224
- )
225
- return temp_chat
 
 
 
 
 
 
 
 
 
 
 
226
 
227
- # def full_chain_history_question(chat_history: list, db_instance):
228
  # try:
229
- # temp_chat = format_chat_history_from_web(chat_history)
230
- # print('Extracted temp chat: ',temp_chat)
231
- # first_prompt = extract_keyword_prompt(temp_chat[-1]["parts"][0])
232
  # temp_answer = model.generate_content(first_prompt).text
233
 
234
  # args = json.loads(utils.trimming(temp_answer))
235
  # contexts, results = response(args, db_instance)
236
- # print('Context extracted: ',contexts)
237
  # if not results:
 
238
  # return "Random question, direct return", contexts
239
  # else:
240
- # QA_Prompt = make_answer_prompt(temp_chat[-1]["parts"][0], contexts)
241
- # temp_chat[-1]["parts"] = QA_Prompt
242
- # print(temp_chat)
243
- # answer = model.generate_content(temp_chat).text
244
  # return temp_answer, answer
245
  # except Exception as e:
246
  # # print(e)
247
  # return temp_answer, "Error occured: " + str(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
- def full_chain_history_question(chat_history: list, db_instance):
250
- try:
251
- temp_chat = format_chat_history_from_web(chat_history)
252
- question = temp_chat[-1]['parts'][0]
253
- first_answer = model_retrieval.generate_content(temp_chat).text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
- print(first_answer)
256
- args = json.loads(utils.trimming(first_answer))
257
 
258
- contexts, results = response(args, db_instance)
259
- if not results:
260
- return "Random question, direct return", contexts
261
- else:
262
- print('Context to answers: ',contexts)
263
- answer_chat = answer_chat_template(question, contexts)
264
- temp_chat[-1] = answer_chat
265
- answer = model_answer.generate_content(temp_chat).text
266
- return first_answer, answer
267
- except Exception as e:
268
- if first_answer:
269
- return first_answer, "Error occured: " + str(e)
270
- else:
271
- return "No answer", "Error occured: " + str(e)
 
1
+ # # my_app/model_manager.py
2
+ # import google.generativeai as genai
3
+ # import chat.arxiv_bot.arxiv_bot_utils as utils
4
+ # import json
5
 
6
+ # model = None
7
 
8
+ # model_retrieval = None
9
 
10
+ # model_answer = None
11
 
12
+ # RETRIEVAL_INSTRUCT = """You are an auto chatbot that response with only one action below based on user question.
13
+ # 1. If the guest question is asking about a science topic, you need to respond the information in JSON schema below:
14
+ # {
15
+ # "keywords": [a list of string keywords about the topic],
16
+ # "description": "a paragraph describing the topic in about 50 to 100 words"
17
+ # }
18
+ # 2. If the guest is not asking for any informations or documents, you need to respond in JSON schema below:
19
+ # {
20
+ # "answer": "your answer to the user question"
21
+ # }"""
22
 
23
+ # ANSWER_INSTRUCT = """You are a library assistant that help answering customer question based on the information given.
24
+ # You always answer in a conversational form naturally and politely.
25
+ # You must introduce all the records given, each must contain title, authors and the link to the pdf file."""
26
 
27
+ # def create_model():
28
+ # with open("apikey.txt","r") as apikey:
29
+ # key = apikey.readline()
30
+ # genai.configure(api_key=key)
31
+ # for m in genai.list_models():
32
+ # if 'generateContent' in m.supported_generation_methods:
33
+ # print(m.name)
34
+ # print("He was there")
35
+ # config = genai.GenerationConfig(max_output_tokens=2048,
36
+ # temperature=1.0)
37
+ # safety_settings = [
38
+ # {
39
+ # "category": "HARM_CATEGORY_DANGEROUS",
40
+ # "threshold": "BLOCK_NONE",
41
+ # },
42
+ # {
43
+ # "category": "HARM_CATEGORY_HARASSMENT",
44
+ # "threshold": "BLOCK_NONE",
45
+ # },
46
+ # {
47
+ # "category": "HARM_CATEGORY_HATE_SPEECH",
48
+ # "threshold": "BLOCK_NONE",
49
+ # },
50
+ # {
51
+ # "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
52
+ # "threshold": "BLOCK_NONE",
53
+ # },
54
+ # {
55
+ # "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
56
+ # "threshold": "BLOCK_NONE",
57
+ # },
58
+ # ]
59
+ # global model, model_retrieval, model_answer
60
+ # model = genai.GenerativeModel("gemini-1.5-pro-latest",
61
+ # generation_config=config,
62
+ # safety_settings=safety_settings)
63
+ # model_retrieval = genai.GenerativeModel("gemini-1.5-pro-latest",
64
+ # generation_config=config,
65
+ # safety_settings=safety_settings,
66
+ # system_instruction=RETRIEVAL_INSTRUCT)
67
+ # model_answer = genai.GenerativeModel("gemini-1.5-pro-latest",
68
+ # generation_config=config,
69
+ # safety_settings=safety_settings,
70
+ # system_instruction=ANSWER_INSTRUCT)
71
+ # return model, model_answer, model_retrieval
72
 
73
+ # def get_model():
74
+ # global model, model_answer, model_retrieval
75
+ # if model is None:
76
+ # # Khởi tạo model ở đây
77
+ # model, model_answer, model_retrieval = create_model() # Giả sử create_model là hàm tạo model của bạn
78
+ # return model, model_answer, model_retrieval
79
 
80
+ # def extract_keyword_prompt(query):
81
+ # """A prompt that return a JSON block as arguments for querying database"""
82
 
83
+ # prompt = """[INST] SYSTEM: You are an auto chatbot that response with only one action below based on user question.
84
+ # 1. If the guest question is asking about a science topic, you need to respond the information in JSON schema below:
85
+ # {
86
+ # "keywords": [a list of string keywords about the topic],
87
+ # "description": "a paragraph describing the topic in about 50 to 100 words"
88
+ # }
89
+ # 2. If the guest is not asking for any informations or documents, you need to respond in JSON schema below:
90
+ # {
91
+ # "answer": "your answer to the user question"
92
+ # }
93
+ # QUESTION: """ + query + """[/INST]
94
+ # ANSWER: """
95
+ # return prompt
96
 
97
+ # def make_answer_prompt(input, contexts):
98
+ # """A prompt that return the final answer, based on the queried context"""
99
 
100
+ # prompt = (
101
+ # """[INST] You are a library assistant that help answering customer QUESTION based on the INFORMATION given.
102
+ # You always answer in a conversational form naturally and politely.
103
+ # You must introduce all the records given, each must contain title, authors and the link to the pdf file.
104
+ # QUESTION: {input}
105
+ # INFORMATION: '{contexts}'
106
+ # [/INST]
107
+ # ANSWER:
108
+ # """
109
+ # ).format(input=input, contexts=contexts)
110
+ # return prompt
111
 
112
+ # def retrieval_chat_template(question):
113
+ # return {
114
+ # "role":"user",
115
+ # "parts":[f"QUESTION: {question} \n ANSWER:"]
116
+ # }
117
 
118
+ # def answer_chat_template(question, contexts):
119
+ # return {
120
+ # "role":"user",
121
+ # "parts":[f"QUESTION: {question} \n INFORMATION: {contexts} \n ANSWER:"]
122
+ # }
123
 
124
+ # def response(args, db_instance):
125
+ # """Create response context, based on input arguments"""
126
+ # keys = list(dict.keys(args))
127
+ # if "answer" in keys:
128
+ # return args['answer'], None # answer directly
129
 
130
+ # if "keywords" in keys:
131
+ # # perform query
132
+ # query_texts = args["description"]
133
+ # keywords = args["keywords"]
134
+ # results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)
135
+ # # print(results)
136
+ # ids = results['metadatas'][0]
137
+ # if len(ids) == 0:
138
+ # # go crawl some
139
+ # new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)
140
+ # print("Got new records: ",len(new_records))
141
+ # if type(new_records) == str:
142
+ # return "Error occured, information not found", new_records
143
+ # utils.db.add(new_records)
144
+ # db_instance.add(new_records)
145
+ # results = utils.db.query_relevant(keywords=keywords, query_texts=query_texts)
146
+ # ids = results['metadatas'][0]
147
+ # print("Re-queried on chromadb, results: ",ids)
148
+ # paper_id = [id['paper_id'] for id in ids]
149
+ # paper_info = db_instance.query_id(paper_id)
150
+ # print(paper_info)
151
+ # records = [] # get title (2), author (3), link (6)
152
+ # result_string = ""
153
+ # if paper_info:
154
+ # for i in range(len(paper_info)):
155
+ # result_string += "Record no.{} - Title: {}, Author: {}, Link: {}, ".format(i+1,paper_info[i][2],paper_info[i][3],paper_info[i][6])
156
+ # id = paper_info[i][0]
157
+ # selected_document = utils.db.query_exact(id)["documents"]
158
+ # doc_str = "Summary:"
159
+ # for doc in selected_document:
160
+ # doc_str+= doc + " "
161
+ # result_string += doc_str
162
+ # records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
163
+ # return result_string, records
164
+ # else:
165
+ # return "Information not found", "Information not found"
166
+ # # invoke llm and return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
+ # # if "title" in keys:
169
+ # # title = args['title']
170
+ # # authors = utils.authors_str_to_list(args['author'])
171
+ # # paper_info = db_instance.query(title = title,author = authors)
172
+ # # # if query not found then go crawl brh
173
+ # # # print(paper_info)
 
 
 
 
 
 
 
174
 
175
+ # # if len(paper_info) == 0:
176
+ # # new_records = utils.crawl_exact_paper(title=title,author=authors)
177
+ # # print("Got new records: ",len(new_records))
178
+ # # if type(new_records) == str:
179
+ # # # print(new_records)
180
+ # # return "Error occured, information not found", "Information not found"
181
+ # # utils.db.add(new_records)
182
+ # # db_instance.add(new_records)
183
+ # # paper_info = db_instance.query(title = title,author = authors)
184
+ # # print("Re-queried on chromadb, results: ",paper_info)
185
+ # # # -------------------------------------
186
+ # # records = [] # get title (2), author (3), link (6)
187
+ # # result_string = ""
188
+ # # for i in range(len(paper_info)):
189
+ # # result_string += "Title: {}, Author: {}, Link: {}".format(paper_info[i][2],paper_info[i][3],paper_info[i][6])
190
+ # # records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
191
+ # # # process results:
192
+ # # if len(result_string) == 0:
193
+ # # return "Information not found", "Information not found"
194
+ # # return result_string, records
195
+ # # invoke llm and return result
196
 
197
+ # def full_chain_single_question(input_prompt, db_instance):
198
  # try:
199
+ # first_prompt = extract_keyword_prompt(input_prompt)
 
 
200
  # temp_answer = model.generate_content(first_prompt).text
201
 
202
  # args = json.loads(utils.trimming(temp_answer))
203
  # contexts, results = response(args, db_instance)
 
204
  # if not results:
205
+ # # print(contexts)
206
  # return "Random question, direct return", contexts
207
  # else:
208
+ # output_prompt = make_answer_prompt(input_prompt,contexts)
209
+ # answer = model.generate_content(output_prompt).text
 
 
210
  # return temp_answer, answer
211
  # except Exception as e:
212
  # # print(e)
213
  # return temp_answer, "Error occured: " + str(e)
214
+
215
+
216
+ # def format_chat_history_from_web(chat_history: list):
217
+ # temp_chat = []
218
+ # for message in chat_history:
219
+ # temp_chat.append(
220
+ # {
221
+ # "role": message["role"],
222
+ # "parts": [message["content"]]
223
+ # }
224
+ # )
225
+ # return temp_chat
226
+
227
+ # # def full_chain_history_question(chat_history: list, db_instance):
228
+ # # try:
229
+ # # temp_chat = format_chat_history_from_web(chat_history)
230
+ # # print('Extracted temp chat: ',temp_chat)
231
+ # # first_prompt = extract_keyword_prompt(temp_chat[-1]["parts"][0])
232
+ # # temp_answer = model.generate_content(first_prompt).text
233
 
234
+ # # args = json.loads(utils.trimming(temp_answer))
235
+ # # contexts, results = response(args, db_instance)
236
+ # # print('Context extracted: ',contexts)
237
+ # # if not results:
238
+ # # return "Random question, direct return", contexts
239
+ # # else:
240
+ # # QA_Prompt = make_answer_prompt(temp_chat[-1]["parts"][0], contexts)
241
+ # # temp_chat[-1]["parts"] = QA_Prompt
242
+ # # print(temp_chat)
243
+ # # answer = model.generate_content(temp_chat).text
244
+ # # return temp_answer, answer
245
+ # # except Exception as e:
246
+ # # # print(e)
247
+ # # return temp_answer, "Error occured: " + str(e)
248
+
249
+ # def full_chain_history_question(chat_history: list, db_instance):
250
+ # try:
251
+ # temp_chat = format_chat_history_from_web(chat_history)
252
+ # question = temp_chat[-1]['parts'][0]
253
+ # first_answer = model_retrieval.generate_content(temp_chat).text
254
 
255
+ # print(first_answer)
256
+ # args = json.loads(utils.trimming(first_answer))
257
 
258
+ # contexts, results = response(args, db_instance)
259
+ # if not results:
260
+ # return "Random question, direct return", contexts
261
+ # else:
262
+ # print('Context to answers: ',contexts)
263
+ # answer_chat = answer_chat_template(question, contexts)
264
+ # temp_chat[-1] = answer_chat
265
+ # answer = model_answer.generate_content(temp_chat).text
266
+ # return first_answer, answer
267
+ # except Exception as e:
268
+ # if first_answer:
269
+ # return first_answer, "Error occured: " + str(e)
270
+ # else:
271
+ # return "No answer", "Error occured: " + str(e)
chat/model_manage2.py ADDED
@@ -0,0 +1,174 @@
1
+ import chat.arxiv_bot.arxiv_bot_utils2 as utils
2
+ import google.generativeai as genai
3
+ import json
4
+ import os
5
+ from google.generativeai.types import content_types
6
+ from collections.abc import Iterable
7
+ from IPython import display
8
+ from IPython.display import Markdown
9
+
10
+ # ----------------------- define instructions -----------------------
11
+ system_instruction = """You are a library chatbot that helps people find relevant articles about a topic, or find a specific article with a given title and authors.
12
+ Your job is to analyze the user question, generate enough parameters from it, and use the tools that are given to you.
13
+ Also, after the function call is done, you must post-process the results in a more conversational form, providing some explanation about the paper based on its summary to avoid recitation.
14
+ You must provide the link to its Arxiv pdf page."""
15
+
16
+ # --------------------------- define tools --------------------------
17
+ def search_for_relevant_article(keywords: list[str], topic_description: str) -> str:
18
+     """This tool is used to search for articles from the database which are relevant to a topic, using a list of more than 3 keywords and a long, sentence-length topic description.
19
+     If the question does not provide at least 3 keywords, the model must generate more keywords related to the topic.
20
+     If there is no description of the topic, the model must generate one for the function call.
21
+     \nThe result is a string describing the records found from the database: 'Record no. - Title: <title>, Author: <authors>, Link: <link to the pdf file>, Summary: <summary of the article>'. There can be many records.
22
+     \nIf the result is 'Information not found' it means some error has occurred, or the database has no relevant article"""
23
+
24
+ print('Keywords: {}, description: {}'.format(keywords,topic_description))
25
+
26
+ results = utils.ArxivChroma.query_relevant(keywords=keywords, query_texts=topic_description)
27
+ # print(results)
28
+ ids = results['metadatas'][0]
29
+ if len(ids) == 0:
30
+ # go crawl some
31
+ new_records = utils.crawl_arxiv(keyword_list=keywords, max_results=10)
32
+ # print("Got new records: ",len(new_records))
33
+ if type(new_records) == str:
34
+ return "Information not found"
35
+
36
+ utils.ArxivChroma.add(new_records)
37
+ utils.ArxivSQL.add(new_records)
38
+ results = utils.ArxivChroma.query_relevant(keywords=keywords, query_texts=topic_description)
39
+ ids = results['metadatas'][0]
40
+ # print("Re-queried on chromadb, results: ",ids)
41
+
42
+ paper_id = [id['paper_id'] for id in ids]
43
+ paper_info = utils.ArxivSQL.query_id(paper_id)
44
+ # print(paper_info)
45
+ records = [] # get title (2), author (3), link (6)
46
+ result_string = ""
47
+ if paper_info:
48
+ for i in range(len(paper_info)):
49
+ result_string += "Record no.{} - Title: {}, Author: {}, Link: {}, ".format(i+1,paper_info[i][2],paper_info[i][3],paper_info[i][6])
50
+ id = paper_info[i][0]
51
+ selected_document = utils.ArxivChroma.query_exact(id)["documents"]
52
+ doc_str = "Summary:"
53
+ for doc in selected_document:
54
+ doc_str+= doc + " "
55
+ result_string += doc_str
56
+ records.append([paper_info[i][2],paper_info[i][3],paper_info[i][6]])
57
+ return result_string
58
+ else:
59
+ return "Information not found"
60
+
61
+ def search_for_specific_article(title: str, authors: list[str]) -> str:
62
+     """This tool is used to search for a specific article from the database, with its title and authors given.
63
+     \nThe result is a string describing the records found from the database: 'Record no. - Title: <title>, Author: <authors>, Link: <link to the pdf file>, Summary: <summary of the article>'. There can be many records.
64
+     \nIf the result is 'Information not found' it means some error has occurred, or the database has no relevant article"""
65
+
66
+     print('Title: {}, authors: {}'.format(title, authors))
67
+
68
+ paper_info = utils.ArxivSQL.query(title = title,author = authors)
69
+ if len(paper_info) == 0:
70
+ new_records = utils.crawl_exact_paper(title=title,author=authors)
71
+ # print("Got new records: ",len(new_records))
72
+ if type(new_records) == str:
73
+ # print(new_records)
74
+ return "Information not found"
75
+ utils.ArxivChroma.add(new_records)
76
+ utils.ArxivSQL.add(new_records)
77
+ paper_info = utils.ArxivSQL.query(title = title,author = authors)
78
+ # print("Re-queried on chromadb, results: ",paper_info)
79
+ # -------------------------------------
80
+     records = []  # get title (2), author (3), link (6)
+     result_string = ""
+     if paper_info:
+         for i in range(len(paper_info)):
+             result_string += "Record no.{} - Title: {}, Author: {}, Link: {}, ".format(i+1, paper_info[i][2], paper_info[i][3], paper_info[i][6])
+             id = paper_info[i][0]
+             selected_document = utils.ArxivChroma.query_exact(id)["documents"]
+             doc_str = "Summary:"
+             for doc in selected_document:
+                 doc_str += doc + " "
+             result_string += doc_str
+             records.append([paper_info[i][2], paper_info[i][3], paper_info[i][6]])
+     # process results:
+     if len(result_string) == 0:
+         return "Information not found"
+     return result_string
+
+ def answer_others_questions(question: str) -> str:
+     """This tool is the default option for questions that are not article or paper requests. The model will answer the question with its own knowledge."""
+     return question
+
+ tools = [search_for_relevant_article, search_for_specific_article, answer_others_questions]
+ tools_name = ['search_for_relevant_article', 'search_for_specific_article', 'answer_others_questions']
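+ # Note: the Gemini SDK builds the tool declarations from these functions' signatures,
+ # type hints and docstrings, and enable_automatic_function_calling (see init_model below)
+ # lets the ChatSession run them and feed the results back to the model on its own.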
+
+ # load key, prepare config ------------------------
+ with open("apikey.txt", "r") as apikey:
+     key = apikey.readline().strip()  # strip a trailing newline, if any, from the key file
+ genai.configure(api_key=key)
+ generation_config = {
+     "temperature": 1,
+     "top_p": 1,
+     "top_k": 0,
+     "max_output_tokens": 2048,
+     "response_mime_type": "text/plain",
+ }
+ safety_settings = [
+     {
+         "category": "HARM_CATEGORY_DANGEROUS",
+         "threshold": "BLOCK_NONE",
+     },
+     {
+         "category": "HARM_CATEGORY_HARASSMENT",
+         "threshold": "BLOCK_NONE",
+     },
+     {
+         "category": "HARM_CATEGORY_HATE_SPEECH",
+         "threshold": "BLOCK_NONE",
+     },
+     {
+         "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+         "threshold": "BLOCK_NONE",
+     },
+     {
+         "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+         "threshold": "BLOCK_NONE",
+     },
+ ]
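+ # every category above is set to BLOCK_NONE, i.e. Gemini's safety filtering is effectively turned off for this chatbot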
+ # returns a tool_config with mode 'none', 'any' or 'auto'
+ def tool_config_from_mode(mode: str, fns: Iterable[str] = ()):
+     """Create a tool config with the specified function calling mode."""
+     return content_types.to_tool_config(
+         {"function_calling_config": {"mode": mode, "allowed_function_names": fns}}
+     )
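+ # For example (assuming the Gemini function-calling config semantics):
+ #   tool_config_from_mode("any", ["search_for_relevant_article"]) forces the model to call
+ #   only that tool on the next turn, "none" disables tool calls, and "auto" lets the model decide.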
+
+ def init_model(mode="auto"):
+     # return an instance of a model holding its own ChatSession
+     # every socket session holds its own model
+     # this function must be called on socket init; start_chat() begins the chat
+     model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest",
+                                   safety_settings=safety_settings,
+                                   generation_config=generation_config,
+                                   tools=tools,
+                                   tool_config=tool_config_from_mode(mode),
+                                   system_instruction=system_instruction)
+     chat_instance = model.start_chat(enable_automatic_function_calling=True)
+     return model, chat_instance
+
+ # handle tool calls within the chat session
+ def full_chain_history_question(user_input, chat_instance: genai.ChatSession, mode="auto"):
+     try:
+         response = chat_instance.send_message(user_input, tool_config=tool_config_from_mode(mode)).text
+         return response, chat_instance.history
+     except Exception as e:
+         print(e)
+         return f'Error occurred during call: {e}', chat_instance.history
+
+ # for printing a log of the session
+ def print_history(history):
+     for content in history:
+         part = content.parts[0]
+         print(content.role, "->", type(part).to_dict(part))
+         print('-'*80)
+
+ utils.ArxivChroma.connect()
+ utils.ArxivSQL.connect()
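+
+ # Minimal usage sketch (hypothetical caller, e.g. the websocket consumer, not shown in this file):
+ #   model, chat = init_model()
+ #   reply, history = full_chain_history_question("Find me papers about graph neural networks", chat)
+ #   print(reply)
+ #   print_history(history)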
chatbot_django/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (170 Bytes). View file
 
chatbot_django/__pycache__/asgi.cpython-311.pyc ADDED
Binary file (1.24 kB). View file
 
chatbot_django/__pycache__/settings.cpython-311.pyc ADDED
Binary file (2.7 kB). View file
 
chatbot_django/__pycache__/urls.cpython-311.pyc ADDED
Binary file (1.25 kB). View file
 
concat.txt ADDED
Binary file (32.7 kB). View file
 
db.sqlite3 CHANGED
Binary files a/db.sqlite3 and b/db.sqlite3 differ
 
models/models--jinaai--jina-bert-implementation/blobs/64b6ce6fe4477c320b0ab303e2f26ae98beae1f7 ADDED
The diff for this file is too large to render. See raw diff