Tsunnami committed on
Commit
2bceb6f
1 Parent(s): e9a4955

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +129 -1
README.md CHANGED
@@ -409,4 +409,132 @@ topic_model.get_topic_info()
409
  |367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
410
  |368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
411
 
412
- </details>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  |367|10|367_opioid_morphine_pain_nefopam|opioid,morphine,pain,nefopam,us,epidural,postoperative,intrathecal,analgesia,anesthesia|
410
  |368|10|368_lps_macrophages_sepsis_mgmt|lps,macrophages,sepsis,mgmt,mice,cgas,bam15,ezh2,clp,null|
411
 
412
+ </details>
413
+
414
+ ## Training Procedure
415
+
416
+ The model was trained as follows:
417
+
418
+ ```py
419
+ from bertopic import BERTopic
420
+
421
+ from sentence_transformers import SentenceTransformer
422
+
423
+ from umap import UMAP
424
+ from hdbscan import HDBSCAN
425
+ from sklearn.feature_extraction.text import CountVectorizer
426
+ from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance, ZeroShotClassification
427
+
428
+ embedding_model = SentenceTransformer("all-mpnet-base-v2")
429
+ umap_model = UMAP(n_neighbors=25, n_components=5, min_dist=0.0, metric='cosine', random_state=42, verbose=True) #change n_neightbor, n_components, metric
430
+ hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True) #change min_cluster_size, min_samples
431
+ vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)
432
+
433
+
434
+ representation_models = {
435
+ "POS": PartOfSpeech("en_core_web_lg"),
436
+ "KeyBERTInspired": KeyBERTInspired(),
437
+ "MMR": MaximalMarginalRelevance(diversity=0.3),
438
+ "KeyBERT + MMR": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)],
439
+ "Summarization": summarization, # Custom prompted model used for summarization (defined below).
440
+ }
441
+
442
+ topic_model = BERTopic(
443
+ language="english",
444
+ embedding_model=embedding_model,
445
+ umap_model=umap_model,
446
+ #hdbscan_model=hdbscan_model,
447
+ #vectorizer_model=vectorizer_model,
448
+ representation_model=representation_models,
449
+ verbose=True,
450
+ )
451
+ topics, probs = topic_model.fit_transform(docs)
452
+ ```
453
+
454
+ ## Create Own Representation Model
455
+
456
+ Using [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) because it is lightweight
457
+
458
+ ### Defined Summarization
459
+
460
+ ```py
461
+ import torch
462
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
463
+
464
+ torch.random.manual_seed(42)
465
+
466
+ summarization_model = AutoModelForCausalLM.from_pretrained(
467
+ "microsoft/Phi-3-mini-128k-instruct",
468
+ device_map="cuda",
469
+ torch_dtype="auto",
470
+ trust_remote_code=True,
471
+ )
472
+ summarization_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
473
+
474
+ def summarize_with_model(text):
475
+ question = f"""
476
+ I have a document of which abstract and title are given.
477
+ The following documents are a small but representative subset of all documents in the topic:
478
+ {text}
479
+
480
+ Based on the information above, please give a description topic in the following keyword format:
481
+ topic: <description>
482
+ """
483
+ messages = [
484
+ {"role": "user", "content": question},
485
+ ]
486
+ pipe = pipeline(
487
+ "text-generation",
488
+ model=summarization_model,
489
+ tokenizer=summarization_tokenizer,
490
+ )
491
+ generation_args = {
492
+ "max_new_tokens": 128,
493
+ "return_full_text": False,
494
+ "temperature": 0.0,
495
+ "do_sample": False,
496
+ }
497
+ output = pipe(messages, **generation_args)
498
+ return output[0]['generated_text']
499
+
500
+ ```
501
+
502
+ Prompt used:
503
+
504
+ ```py
505
+ question = f"""
506
+ I have a document of which abstract and title are given.
507
+ The following documents are a small but representative subset of all documents in the topic:
508
+ {text}
509
+
510
+ Based on the information above, please give a description topic in the following keyword format:
511
+ topic: <description>
512
+ """
513
+ ```
514
+
515
+ **NOTE: Experimenting with other, better prompts is recommended**
516
+
517
+ ### Mounting onto BaseRepresentation
518
+
519
+ ```py
520
+ from bertopic.representation._base import BaseRepresentation
521
+ from typing import List, Mapping, Tuple
522
+
523
+ class SummarizationRepresentation(BaseRepresentation):
524
+ def __init__(self, summarization_model, summarization_tokenizer):
525
+ self.summarization_model = summarization_model
526
+ self.summarization_tokenizer = summarization_tokenizer
527
+
528
+ def extract_topics(self, topic_model, documents, c_tf_idf, topics
529
+ ) -> Mapping[str, List[Tuple[str, float]]]:
530
+ updated_topics = {}
531
+ for topic_id, words in topics.items():
532
+ # Extract only the words from the tuples
533
+ words_only = [word[0] for word in words]
534
+ text = " ".join(words_only)
535
+ summary = summarize_with_model(text)
536
+ updated_topics[topic_id] = [(summary, 1.0)]
537
+ return updated_topics
538
+
539
+ summarization = SummarizationRepresentation(summarization_model, summarization_tokenizer)
540
+ ```