Spaces:

LittleLittleCloud
/

mlnet-samples

Runtime error

App Files Files Community

XiaoYun Zhang committed on Oct 13, 2023

Commit

6abb254

•

0 Parent(s):

update

Browse files

Files changed (17) hide show

.docker-compose +0 -0
.gitattributes +35 -0
.gitignore +18 -0
.local_storage/user.json +14 -0
LICENSE +21 -0
README.md +13 -0
app.py +302 -0
di.py +51 -0
embedding.py +56 -0
index.py +207 -0
model/document.py +98 -0
model/record.py +8 -0
model/user.py +13 -0
requirements.txt +4 -0
setting.py +18 -0
setup.py +13 -0
storage.py +48 -0

.docker-compose ADDED Viewed

File without changes

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,18 @@

+# .gitignore file for Python projects
+# Covers most common project files and folders
+.DS_Store
+*.pyc
+*.pyo
+*.pyd
+__pycache__
+*.so
+*.egg
+*.egg-info
+dist
+build
+docs/_build
+.idea
+venv
+test
+.env

.local_storage/user.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+    "user_name": "bigmiao",
+    "email": "[email protected]",
+    "full_name": "g2260578356",
+    "disabled": false,
+    "documents": [
+        {
+            "name": "mlnet_notebook_examples_v1.json.json",
+            "description": null,
+            "status": "done",
+            "url": "bigmiao-mlnet_examples.json"
+        }
+    ]
+}

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Xiaoyun Zhang
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: Mlnet Samples
+emoji: 😻
+colorFrom: yellow
+colorTo: indigo
+sdk: gradio
+sdk_version: 3.47.1
+app_file: app.py
+pinned: false
+license: mit
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import fastapi as api
+from typing import Annotated
+from fastapi.security import OAuth2PasswordBearer, OAuth2AuthorizationCodeBearer, OAuth2PasswordRequestForm
+from model.document import Document, PlainTextDocument, JsonDocument
+import sys
+from model.user import User
+from fastapi import FastAPI, File, UploadFile
+from di import initialize_di_for_app
+import gradio as gr
+import os
+import json
+SETTINGS, STORAGE, EMBEDDING, INDEX = initialize_di_for_app()
+user_json_str = STORAGE.load('user.json')
+USER = User.parse_raw(user_json_str)
+oauth2_scheme  = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/token")
+app = api.FastAPI()
+app.openapi_version = "3.0.0"
+users = [USER]
+async def get_current_user(token: str = api.Depends(oauth2_scheme)):
+    '''
+    Get current user
+    '''
+    for user in users:
+        if user.user_name == token:
+            return user
+    raise api.HTTPException(status_code=401, detail="Invalid authentication credentials")
+@app.post("/api/v1/auth/token")
+async def login(form_data: Annotated[OAuth2PasswordRequestForm, api.Depends()]):
+    '''
+    Login to get a token
+    '''
+    return {"access_token": form_data.username}
+@app.post("/api/v1/uploadfile/", include_in_schema=False)
+def create_upload_file(file: UploadFile = api.File(...)) -> Document:
+    '''
+    Upload a file
+    '''
+    fileUrl = f'{USER.user_name}-{file.filename}'
+    STORAGE.save(fileUrl, file.read())
+    # create plainTextDocument if the file is a text file
+    if file.filename.endswith('.txt'):
+        return PlainTextDocument(
+            name=file.filename,
+            status='uploading',
+            url=fileUrl,
+            embedding=EMBEDDING,
+            storage=STORAGE,
+        )
+    else:
+        raise api.HTTPException(status_code=400, detail="File type not supported")
+### /api/v1/.well-known
+#### Get /openapi.json
+# Get the openapi json file
+@app.get("/api/v1/.well-known/openapi.json")
+async def get_openapi():
+    '''
+    otherwise return 401
+    '''
+    # get a list of document names + description
+    document_list = [[doc.name, doc.description] for doc in USER.documents]
+    # get openapi json from api
+    openapi = app.openapi().copy()
+    openapi['info']['title'] = 'DocumentSearch'
+    description = f'''Search documents with a query.
+    ## Documents
+    {document_list}
+    '''
+    openapi['info']['description'] = description
+    # update description in /api/v1/search
+    openapi['paths']['/api/v1/search']['get']['description'] += f'''
+Available documents:
+{document_list}
+'''
+    # filter out unnecessary endpoints
+    openapi['paths'] = {
+        '/api/v1/search': openapi['paths']['/api/v1/search'],
+    }
+    # remove components
+    openapi['components'] = {}
+    # return the openapi json
+    return openapi
+### /api/v1/document
+#### Get /list
+# Get the list of documents
+@app.get("/api/v1/document/list")
+# async def get_document_list(user: Annotated[User, api.Depends(get_current_user)]) -> list[Document]:
+async def get_document_list() -> list[Document]:
+    '''
+    Get the list of documents
+    '''
+    # The authenticated signature is commented out above, so this always
+    # returns the documents of the single module-level USER.
+    return USER.documents
+#### Post /upload
+# Upload a document
+@app.post("/api/v1/document/upload")
+# def upload_document(user: Annotated[User, api.Depends(get_current_user)], document: Annotated[Document, api.Depends(create_upload_file)]):
+def upload_document(document: Annotated[Document, api.Depends(create_upload_file)]):
+    '''
+    Upload a document
+    '''
+    document.status = 'processing'
+    INDEX.load_or_update_document(user, document, progress)
+    document.status = 'done'
+    USER.documents.append(document)
+#### Get /delete
+# Delete a document
+@app.get("/api/v1/document/delete")
+# async def delete_document(user: Annotated[User, api.Depends(get_current_user)], document_name: str):
+async def delete_document(document_name: str):
+    '''
+    Delete a document
+    '''
+    for doc in USER.documents:
+        if doc.name == document_name:
+            STORAGE.delete(doc.url)
+            INDEX.remove_document(USER, doc)
+            USER.documents.remove(doc)
+            return
+    raise api.HTTPException(status_code=404, detail="Document not found")
+# Query the index
+@app.get("/api/v1/search", operation_id=None,)
+def search(
+    # user: Annotated[User, api.Depends(get_current_user)],
+    query: str,
+    document_name: str = None,
+    top_k: int = 10,
+    threshold: float = 0.5):
+    '''
+    Search documents with a query. It will return [top_k] results with a score higher than [threshold].
+    query: the query string, required
+    document_name: the document name, optional. You can provide this parameter to search in a specific document.
+    top_k: the number of results to return, optional. Default to 10.
+    threshold: the threshold of the results, optional. Default to 0.5.
+    '''
+    if document_name:
+        for doc in USER.documents:
+            if doc.name == document_name:
+                return INDEX.query_document(USER, doc, query, top_k, threshold)
+        raise api.HTTPException(status_code=404, detail="Document not found")
+    else:
+        return INDEX.query_index(USER, query, top_k, threshold)
+def receive_signal(signalNumber, frame):
+    print('Received:', signalNumber)
+    sys.exit()
+@app.on_event("startup")
+async def startup_event():
+    import signal
+    signal.signal(signal.SIGINT, receive_signal)
+    # startup tasks
+@app.on_event("shutdown")
+def exit_event():
+    # save USER
+    STORAGE.save('user.json', USER.model_dump_json())
+    print('exit')
+# Lowercase module-level alias for the single application USER.
+user = USER
+def gradio_upload_document(file: File):
+    file_temp_path = file.name
+    # load file
+    file_name = os.path.basename(file_temp_path)
+    fileUrl = f'{USER.user_name}-{file_name}'
+    with open(file_temp_path, 'r', encoding='utf-8') as f:
+        STORAGE.save(fileUrl, f.read())
+    # create plainTextDocument if the file is a text file
+    doc = None
+    if file_name.endswith('.txt'):
+        doc = PlainTextDocument(
+            name=file_name,
+            status='uploading',
+            url=fileUrl,
+            embedding=EMBEDDING,
+            storage=STORAGE,
+        )
+    elif file_name.endswith('.json'):
+        doc = JsonDocument(
+            name=file_name,
+            status='uploading',
+            url=fileUrl,
+            embedding=EMBEDDING,
+            storage=STORAGE,
+        )
+    else:
+        raise api.HTTPException(status_code=400, detail="File type not supported")
+    doc.status = 'processing'
+    INDEX.load_or_update_document(user, doc)
+    doc.status = 'done'
+    USER.documents.append(doc)
+    return f'uploaded {file_name}'
+def gradio_query(query: str, document_name: str = None, top_k: int = 10, threshold: float = 0.5):
+    res_or_exception = search(query, document_name, top_k, threshold)
+    if isinstance(res_or_exception, api.HTTPException):
+        raise res_or_exception
+    # convert to json string
+    records = [record.model_dump(mode='json') for record in res_or_exception]
+    return json.dumps(records, indent=4)
+# Gradio front end: a header row with the user name and the OpenAPI URL, plus a "search" tab
+# wired to gradio_query. The document listing and upload tabs are currently commented out.
+with gr.Blocks() as ui:
+    gr.Markdown("#llm-memory")
+    with gr.Column():
+        gr.Markdown(
+        """
+        ## LLM Memory
+        """)
+        with gr.Row():
+            user_name = gr.Label(label="User name", value=USER.user_name)
+            # url to .well-known/openapi.json
+            gr.Label(label=".wellknown/openapi.json", value=f"/api/v1/.well-known/openapi.json")
+        # with gr.Tab("avaiable documents"):
+        #     available_documents = gr.Label(label="avaiable documents", value="avaiable documents")
+        #     refresh_btn = gr.Button(label="refresh", type="button")
+        #     refresh_btn.click(lambda: '\r\n'.join([doc.name for doc in USER.documents]), None, available_documents)
+        #     documents = USER.documents
+        #     for document in documents:
+        #         gr.Label(label=document.name, value=document.name)
+        # with gr.Tab("upload document"):
+        #     with gr.Tab("upload .txt document"):
+        #         file = gr.File(label="upload document", type="file", file_types=[".txt"])
+        #         output = gr.Label(label="output", value="output")
+        #         upload_btn = gr.Button("upload document", type="button")
+        #         upload_btn.click(gradio_upload_document, file, output)
+        #     with gr.Tab("upload .json document"):
+        #         gr.Markdown(
+        #         """
+        #         The json document should be a list of objects, each object should have a `content` field. If you want to add more fields, you can add them in the `meta_data` field.
+        #         For example:
+        #         ```json
+        #         [
+        #             {
+        #                 "content": "hello world",
+        #                 "meta_data": {
+        #                     "title": "hello world",
+        #                     "author": "llm-memory"
+        #                 }
+        #             },
+        #             {
+        #                 "content": "hello world",
+        #                 "meta_data": {
+        #                     "title": "hello world",
+        #                     "author": "llm-memory"
+        #                 }
+        #             }
+        #         ]
+        #         ```
+        #         ## Note
+        #         - The `meta_data` should be a dict which both keys and values are strings.
+        #         """)
+        #         file = gr.File(label="upload document", type="file", file_types=[".json"])
+        #         output = gr.Label(label="output", value="output")
+        #         upload_btn = gr.Button("upload document", type="button")
+        #         upload_btn.click(gradio_upload_document, file, output)
+        with gr.Tab("search"):
+            query = gr.Textbox(label="search", placeholder="Query")
+            # The dropdown choices are computed once, when the UI is built, from USER.documents;
+            # the leading None entry lets the user search across all documents.
+            # NOTE(review): confirm gr.Dropdown / gr.Number accept `placeholder` in the pinned gradio version.
+            document = gr.Dropdown(label="document", choices=[None] + [doc.name for doc in USER.documents], placeholder="document, optional")
+            top_k = gr.Number(label="top_k", placeholder="top_k, optional", value=10)
+            threshold = gr.Number(label="threshold", placeholder="threshold, optional", value=0.5)
+            output = gr.Code(label="output", language="json", value="output")
+            query_btn = gr.Button("Query")
+            # Inputs are passed positionally to gradio_query(query, document_name, top_k, threshold).
+            query_btn.click(gradio_query, [query, document, top_k, threshold], output, api_name="search")
+# Wrap the Blocks UI in its own app object and mount it at the root of the FastAPI app.
+gradio_app = gr.routes.App.create_app(ui)
+app.mount("/", gradio_app)
+# NOTE(review): launch() starts the Blocks UI directly at import time; confirm whether the
+# FastAPI `app` (and its /api/v1 routes) is actually served in this mode.
+ui.launch()

di.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from storage import LocalStorage, Storage
+from setting import Settings
+from embedding import AzureOpenAITextAda002, Embedding, OpenAITextAda002
+from index import Index, QDrantVectorStore
+from model.user import User
+from qdrant_client import QdrantClient
+def initialize_di_for_test() -> tuple[Settings, Storage,Embedding,Index]:
+    SETTINGS = Settings(_env_file='./test/.env.test')
+    STORAGE = LocalStorage('./test/test_storage')
+    if SETTINGS.embedding_use_azure:
+        EMBEDDING = AzureOpenAITextAda002(
+            api_base=SETTINGS.embedding_azure_openai_api_base,
+            model_name=SETTINGS.embedding_azure_openai_model_name,
+            api_key=SETTINGS.embedding_azure_openai_api_key,
+        )
+    else:
+        EMBEDDING = OpenAITextAda002(SETTINGS.openai_api_key)
+    INDEX = QDrantVectorStore(
+        embedding=EMBEDDING,
+        client= QdrantClient(
+            url=SETTINGS.qdrant_url,
+            api_key=SETTINGS.qdrant_api_key,),
+        collection_name='test_collection',
+    )
+    INDEX.create_collection_if_not_exists()
+    return SETTINGS, STORAGE, EMBEDDING, INDEX
+def initialize_di_for_app() -> tuple[Settings, Storage,Embedding,Index]:
+    SETTINGS = Settings(_env_file='.env')
+    STORAGE = LocalStorage('.local_storage')
+    if SETTINGS.embedding_use_azure:
+        EMBEDDING = AzureOpenAITextAda002(
+            api_base=SETTINGS.embedding_azure_openai_api_base,
+            model_name=SETTINGS.embedding_azure_openai_model_name,
+            api_key=SETTINGS.embedding_azure_openai_api_key,
+        )
+    else:
+        EMBEDDING = OpenAITextAda002(SETTINGS.openai_api_key)
+    INDEX = QDrantVectorStore(
+        embedding=EMBEDDING,
+        client= QdrantClient(
+            url=SETTINGS.qdrant_url,
+            api_key=SETTINGS.qdrant_api_key,),
+        collection_name='collection',
+    )
+    return SETTINGS, STORAGE, EMBEDDING, INDEX

embedding.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import openai
+class Embedding:
+    type: str|None = None
+    vector_size: int|None = None
+    def generate_embedding(self, content: str) -> list[float]:
+        pass
+class OpenAITextAda002(Embedding):
+    type: str = 'text-ada-002'
+    vector_size: int = 1536
+    api_key: str = None
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+    def generate_embedding(self, content: str) -> list[float]:
+        # replace newline with space
+        content = content.replace('\n', ' ')
+        # limit to 8192 characters
+        content = content[:6000]
+        return openai.Embedding.create(
+            api_key=self.api_key,
+            api_type='openai',
+            input = content,
+            model="text-embedding-ada-002"
+        )["data"][0]["embedding"]
+class AzureOpenAITextAda002(Embedding):
+    type: str = 'text-ada-002'
+    vector_size: int = 1536
+    api_key: str = None
+    def __init__(
+            self,
+            api_base: str,
+            model_name: str,
+            api_key: str):
+        self.api_key = api_key
+        self.model_name = model_name
+        self.api_key = api_key
+        self.api_base = api_base
+    def generate_embedding(self, content: str) -> list[float]:
+        # replace newline with space
+        content = content.replace('\n', ' ')
+        # limit to 8192 characters
+        content = content[:6000]
+        return openai.Embedding.create(
+            api_key=self.api_key,
+            api_type='azure',
+            api_base=self.api_base,
+            input = content,
+            engine=self.model_name,
+            api_version="2023-07-01-preview"
+        )["data"][0]["embedding"]

index.py ADDED Viewed

	@@ -0,0 +1,207 @@

+from qdrant_client import QdrantClient
+from qdrant_client.http.models import ScoredPoint
+from embedding import Embedding
+from model.document import Document
+from model.record import Record
+from model.user import User
+from qdrant_client.http import models
+import uuid
+import tqdm
+class Index:
+    type: str
+    def load_or_update_document(self, user: User, document: Document, progress: tqdm.tqdm = None):
+        pass
+    def remove_document(self, user: User, document: Document):
+        pass
+    def query_index(self, user: User, query: str, top_k: int = 10, threshold: float = 0.5) -> list[Record]:
+        pass
+    def query_document(self, user: User, document: Document, query: str, top_k: int = 10, threshold: float = 0.5) -> list[Record]:
+        pass
+    def contains(self, user: User, document: Document) -> bool:
+        pass
+class QDrantVectorStore(Index):
+    _client: QdrantClient
+    _embedding: Embedding
+    collection_name: str
+    batch_size: int = 10
+    type: str = 'qdrant'
+    def __init__(
+            self,
+            client: QdrantClient,
+            embedding: Embedding,
+            collection_name: str):
+        self._embedding = embedding
+        self.collection_name = collection_name
+        self._client = client
+    def _response_to_records(self, response: list[ScoredPoint]) -> list[Record]:
+        for point in response:
+            meta_data = point.payload['meta_data']
+            yield Record(
+                embedding=point.vector,
+                meta_data= meta_data,
+                content=point.payload['content'],
+                document_id=point.payload['document_id'],
+                timestamp=point.payload['timestamp'],
+            )
+    def create_collection(self):
+        self._client.recreate_collection(
+            collection_name=self.collection_name,
+            vectors_config=models.VectorParams(
+                size=self._embedding.vector_size,
+                distance=models.Distance.COSINE),
+        )
+    def if_collection_exists(self) -> bool:
+        try:
+            self._client.get_collection(self.collection_name)
+            return True
+        except Exception:
+            return False
+    def create_collection_if_not_exists(self):
+        if not self.if_collection_exists():
+            self.create_collection()
+    def load_or_update_document(self, user: User, document: Document, progress: tqdm.tqdm = None):
+        self.create_collection_if_not_exists()
+        if self.contains(user, document):
+            self.remove_document(user, document)
+        group_id = user.user_name
+        # upsert records in batch
+        records = document.load_records()
+        records = list(records)
+        batch_range = range(0, len(records), self.batch_size)
+        if progress is not None:
+            batch_range = progress(batch_range)
+        for i in batch_range:
+            batch = records[i:i+self.batch_size]
+            uuids = [str(uuid.uuid4()) for _ in batch]
+            payloads = [{
+                'content': record.content,
+                'meta_data': record.meta_data,
+                'document_id': record.document_id,
+                'group_id': group_id,
+                'timestamp': record.timestamp,
+            } for record in batch]
+            vectors = [record.embedding for record in batch]
+            self._client.upsert(
+                collection_name=self.collection_name,
+                points=models.Batch(
+                    payloads=payloads,
+                    ids=uuids,
+                    vectors=vectors,
+                ),
+            )
+    def remove_document(self, user: User, document: Document):
+        if not self.if_collection_exists():
+            return
+        document_id = document.name
+        self._client.delete(
+            collection_name=self.collection_name,
+            points_selector=models.FilterSelector(
+                filter=models.Filter(
+                    must=[
+                        models.FieldCondition(
+                            key="document_id",
+                            match=models.MatchValue(value=document_id)
+                        ),
+                        models.FieldCondition(
+                            key="group_id",
+                            match=models.MatchValue(
+                            value=user.user_name,
+                            ),
+                        )
+                    ]
+                )
+            )
+        )
+    def contains(self, user: User, document: Document) -> bool:
+        document_id = document.name
+        group_id = user.user_name
+        count = self._client.count(
+            collection_name=self.collection_name,
+            count_filter=models.Filter(
+                must=[
+                    models.FieldCondition(
+                        key="document_id",
+                        match=models.MatchValue(value=document_id)
+                    ),
+                    models.FieldCondition(
+                        key="group_id",
+                        match=models.MatchValue(
+                        value=group_id,
+                        ),
+                    )
+                ]
+            ),
+            exact=True,
+        )
+        return count.count > 0
+    def query_index(self, user: User, query: str, top_k: int = 10, threshold: float = 0.5) -> list[Record]:
+        if not self.if_collection_exists():
+            return []
+        response = self._client.search(
+            collection_name=self.collection_name,
+            query_vector=self._embedding.generate_embedding(query),
+            limit=top_k,
+            query_filter= models.Filter(
+                must=[
+                    models.FieldCondition(
+                        key="group_id",
+                        match=models.MatchValue(
+                        value=user.user_name,
+                        ),
+                    )
+                ]
+            ),
+            score_threshold=threshold,
+        )
+        return self._response_to_records(response)
+    def query_document(self, user: User, document: Document, query: str, top_k: int = 10, threshold: float = 0.5) -> list[Record]:
+        if not self.if_collection_exists():
+            return []
+        response = self._client.search(
+            collection_name=self.collection_name,
+            query_vector=self._embedding.generate_embedding(query),
+            limit=top_k,
+            query_filter= models.Filter(
+                must=[
+                    models.FieldCondition(
+                        key="document_id",
+                        match=models.MatchValue(value=document.name)
+                    ),
+                    models.FieldCondition(
+                        key="group_id",
+                        match=models.MatchValue(value=user.user_name),
+                    )
+                ]
+            ),
+            score_threshold=threshold,
+        )
+        return self._response_to_records(response)

model/document.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from pydantic import BaseModel
+from .record import Record
+from storage import Storage
+from embedding import Embedding
+import time
+import json
+class Document(BaseModel):
+    # Base model for an uploaded document. `name` is also used as the document id
+    # in the records that the subclasses produce.
+    name: str
+    description: str | None = None
+    status: str = 'uploading' # uploading, processing, done, failed
+    # Key under which the document content is kept in storage (subclasses load it from there).
+    url: str | None = None
+    # Underscore-prefixed attributes; the subclasses in this module assign them in __init__
+    # after calling super().__init__().
+    _embedding: Embedding
+    _storage: Storage
+    def load_records(self) -> list[Record]:
+        # No-op in the base class (returns None); the subclasses override it as generators.
+        pass
+class PlainTextDocument(Document):
+    def __init__(
+            self,
+            embedding: Embedding,
+            storage: Storage,
+            **kwargs):
+        super().__init__(**kwargs)
+        self._embedding = embedding
+        self._storage = storage
+    def _enhance_line(self, line: str) -> str:
+        return line
+    def load_records(self) -> list[Record]:
+        str = self._storage.load(self.url)
+        lines = str.split('\n')
+        for i, line in enumerate(lines):
+            # remove empty lines
+            if len(line.strip()) == 0:
+                continue
+            enhance_line = self._enhance_line(line)
+            embedding = self._embedding.generate_embedding(enhance_line)
+            embedding_type = self._embedding.type
+            meta_data = {
+                'embedding_type': embedding_type,
+                'document_id': self.name,
+                'line_number': i,
+                'source': line,
+            }
+            yield Record(
+                embedding=embedding,
+                meta_data=meta_data,
+                content=line,
+                document_id=self.name,
+                timestamp=int(time.time()))
+class JsonDocument(Document):
+    def __init__(
+            self,
+            embedding: Embedding,
+            storage: Storage,
+            **kwargs):
+        super().__init__(**kwargs)
+        self._embedding = embedding
+        self._storage = storage
+    def load_records(self) -> list[Record]:
+        '''
+        json format:
+        {
+            'content': str // the content of the record
+            'meta_data': dict // the meta data of the record
+        }
+        '''
+        str = self._storage.load(self.url)
+        records = json.loads(str)
+        for i, item in enumerate(records):
+            # sleep 300ms
+            time.sleep(0.3)
+            embedding = self._embedding.generate_embedding(item['content'])
+            embedding_type = self._embedding.type
+            meta_data = {
+                'embedding_type': embedding_type,
+                'document_id': self.name,
+                'line_number': i,
+                'source': item['content'],
+            }
+            if 'meta_data' in item:
+                # merge meta data
+                meta_data = {**item['meta_data'], **meta_data}
+            yield Record(
+                embedding=embedding,
+                meta_data=meta_data,
+                content=item['content'],
+                document_id=self.name,
+                timestamp=int(time.time()))

model/record.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from pydantic import BaseModel
+class Record(BaseModel):
+    # One indexed unit of a document. Only `content` is required.
+    content: str
+    # Embedding vector for the content, when available.
+    embedding: list[float] | None = None
+    # Identifier of the owning document.
+    document_id: str | None = None
+    # Free-form meta data attached to the record.
+    meta_data: dict | None = None
+    # Integer timestamp; presumably seconds since the epoch - TODO confirm against the producers.
+    timestamp: int | None = None

model/user.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from pydantic import BaseModel
+from .document import Document
+class User(BaseModel):
+    user_name: str
+    email: str
+    full_name: str
+    disabled: bool = None
+    documents: list[Document] = None

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+fastapi[all]==0.103.1
+openai==0.28.0
+python-dotenv==1.0.0
+qdrant-client==1.5.2

setting.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from pydantic_settings import BaseSettings, SettingsConfigDict
+class Settings(BaseSettings):
+    # Application settings; values are read from the env file configured in model_config below.
+    # API key for the non-Azure OpenAI embedding backend.
+    openai_api_key: str | None = None
+    azure_openai_api_key: str | None = None
+    # Qdrant connection settings.
+    qdrant_api_key: str | None = None
+    qdrant_url: str | None = None
+    qdrant_host: str | None = None
+    qdrant_port: int | None = None
+    # embedding setting
+    # When True the Azure embedding backend is selected and the three values below are used.
+    embedding_use_azure: bool = False
+    embedding_azure_openai_api_base: str | None = None
+    embedding_azure_openai_model_name: str | None = None
+    embedding_azure_openai_api_key: str | None = None
+    # Default env file is '.env', read as UTF-8.
+    model_config = SettingsConfigDict(env_file='.env', env_file_encoding='utf-8')

setup.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# setup
+from setuptools import setup
+setup(
+    name='llm_memory',
+    version='1.0',
+    author='LittleLittleCloud',
+    python_requires='>=3.7, <4',
+    install_requires=[
+        'fastapi[all]==0.103.1',
+        'openai==0.28.0',
+        'python-dotenv==1.0.0',
+        'qdrant-client==1.5.2',
+    ])

storage.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import os
+class Storage:
+    def save(self, filename, data):
+        '''
+        Save or update a file
+        '''
+        pass
+    def delete(self, filename):
+        '''
+        Delete a file
+        '''
+        pass
+    def load(self, filename)->str:
+        '''
+        Load a file
+        '''
+        pass
+    def list(self)->list[str]:
+        '''
+        List all files
+        '''
+        pass
+class LocalStorage(Storage):
+    def __init__(self, root):
+        if not os.path.exists(root):
+            os.makedirs(root)
+        self.root = root
+    def save(self, filename, data):
+        with open(os.path.join(self.root, filename), 'w', encoding='utf-8') as f:
+            f.write(data)
+    def delete(self, filename):
+        os.remove(os.path.join(self.root, filename))
+    def load(self, filename):
+        with open(os.path.join(self.root, filename), 'r', encoding='utf-8') as f:
+            return f.read()
+    def list(self):
+        return os.listdir(self.root)