From c5c8b4b16a7127558df8c1fe4559bcddd5a1ad58 Mon Sep 17 00:00:00 2001
From: Matt Williams
Date: Tue, 17 Oct 2023 16:41:28 -0700
Subject: [PATCH] added python rag news summary

Signed-off-by: Matt Williams
---
 examples/python-rag-newssummary/README.md       |  22 ++++
 .../python-rag-newssummary/requirements.txt     |   9 ++
 examples/python-rag-newssummary/summ.py         |  86 ++++++++++++++
 examples/python-rag-newssummary/utils.py        | 108 ++++++++++++++++++
 4 files changed, 225 insertions(+)
 create mode 100644 examples/python-rag-newssummary/README.md
 create mode 100644 examples/python-rag-newssummary/requirements.txt
 create mode 100644 examples/python-rag-newssummary/summ.py
 create mode 100644 examples/python-rag-newssummary/utils.py

diff --git a/examples/python-rag-newssummary/README.md b/examples/python-rag-newssummary/README.md
new file mode 100644
index 00000000..fd749b2b
--- /dev/null
+++ b/examples/python-rag-newssummary/README.md
@@ -0,0 +1,22 @@
+# News Summarizer
+
+This example goes through a series of steps:
+
+ 1. You choose a topic area (e.g., "news", "NVIDIA", "music").
+ 2. It gets the most recent articles on that topic from various sources.
+ 3. It uses Ollama to summarize each article.
+ 4. It creates chunks of sentences from each article.
+ 5. It uses Sentence Transformers to generate embeddings for each of those chunks.
+ 6. You enter a question about the summaries shown.
+ 7. It uses Sentence Transformers to generate an embedding for that question.
+ 8. It uses the embedded question to find the most similar chunks.
+ 9. It feeds all of that to Ollama, which generates an answer to your question grounded in these news articles.
+
+This example lets you pick from a few different topic areas, then summarizes the most recent articles on that topic. It then creates chunks of sentences from each article and generates embeddings for each of those chunks, as sketched below.
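+
+As a rough sketch of that chunk-and-embed step (a sketch only: `article_text` is a stand-in for the text newspaper3k returns for one article):
+
+```python
+from sentence_transformers import SentenceTransformer
+from mattsollamatools import chunker
+
+article_text = "..."  # stand-in: full text of one article
+model = SentenceTransformer('all-MiniLM-L6-v2')
+chunks = chunker(article_text)     # split the article into sentence chunks
+embeddings = model.encode(chunks)  # one embedding vector per chunk
+```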
+
+You can run the example like this:
+
+```bash
+python3 -m pip install -r requirements.txt
+python3 summ.py
+```
diff --git a/examples/python-rag-newssummary/requirements.txt b/examples/python-rag-newssummary/requirements.txt
new file mode 100644
index 00000000..1a92729a
--- /dev/null
+++ b/examples/python-rag-newssummary/requirements.txt
@@ -0,0 +1,9 @@
+beautifulsoup4==4.12.2
+feedparser==6.0.10
+mattsollamatools==0.0.8
+newspaper3k==0.2.8
+nltk==3.8.1
+numpy==1.24.3
+Requests==2.31.0
+scikit_learn==1.3.0
+sentence_transformers==2.2.2
diff --git a/examples/python-rag-newssummary/summ.py b/examples/python-rag-newssummary/summ.py
new file mode 100644
index 00000000..4993cfca
--- /dev/null
+++ b/examples/python-rag-newssummary/summ.py
@@ -0,0 +1,86 @@
+import curses
+import json
+from utils import get_url_for_topic, topic_urls, menu, getUrls, get_summary, getArticleText, knn_search
+import requests
+from sentence_transformers import SentenceTransformer
+from mattsollamatools import chunker
+
+if __name__ == "__main__":
+    # Pick a topic in the curses menu; menu() returns the feed URL for it
+    feed_url = curses.wrapper(menu)
+    print("Here is your news summary:\n")
+    urls = getUrls(feed_url, n=5)
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    allEmbeddings = []
+
+    for url in urls:
+        article = {}
+        article['embeddings'] = []
+        article['url'] = url
+        text = getArticleText(url)
+        summary = get_summary(text)
+        chunks = chunker(text)  # Split the article text into sentence chunks
+        embeddings = model.encode(chunks)
+        for (chunk, embedding) in zip(chunks, embeddings):
+            item = {}
+            item['source'] = chunk
+            item['embedding'] = embedding.tolist()  # Convert NumPy array to list
+            item['sourcelength'] = len(chunk)
+            article['embeddings'].append(item)
+
+        allEmbeddings.append(article)
+
+        print(f"{summary}\n")
+
+    # Keep the model context across questions so follow-ups stay conversational
+    context = []
+    while True:
+        # Get a question from the user
+        question = input("Enter your question about the news, or type quit: ")
+
+        if question.lower() == 'quit':
+            break
+
+        # Embed the user's question
+        question_embedding = model.encode([question])
+
+        # Perform KNN search to find the best matches (indices and source text)
+        best_matches = knn_search(question_embedding, allEmbeddings, k=10)
+
+        sourcetext = ""
+        for i, (index, source_text) in enumerate(best_matches, start=1):
+            sourcetext += f"{i}. Index: {index}, Source Text: {source_text}\n"
+
+        systemPrompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"
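+        # This is the retrieval-augmented step: the best-matching chunks are
+        # packed into the system prompt, so the model is told to answer from
+        # the retrieved article text rather than from its training data.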
+
+        url = "http://localhost:11434/api/generate"
+
+        payload = {
+            "model": "mistral-openorca",
+            "prompt": question,
+            "system": systemPrompt,
+            "stream": False,
+            "context": context
+        }
+
+        # Convert the payload to a JSON string
+        payload_json = json.dumps(payload)
+
+        # Set the headers to specify JSON content
+        headers = {
+            "Content-Type": "application/json"
+        }
+
+        # Send the POST request
+        response = requests.post(url, data=payload_json, headers=headers)
+
+        # Check the response
+        if response.status_code == 200:
+            output = json.loads(response.text)
+            context = output['context']
+            print(output['response'] + "\n")
+        else:
+            print(f"Request failed with status code {response.status_code}")
+
diff --git a/examples/python-rag-newssummary/utils.py b/examples/python-rag-newssummary/utils.py
new file mode 100644
index 00000000..0bce011b
--- /dev/null
+++ b/examples/python-rag-newssummary/utils.py
@@ -0,0 +1,108 @@
+import curses
+import feedparser
+import requests
+import unicodedata
+import json
+from newspaper import Article
+from bs4 import BeautifulSoup
+from nltk.tokenize import sent_tokenize, word_tokenize
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+from mattsollamatools import chunker
+
+# Create a dictionary to store topics and their URLs
+topic_urls = {
+    "Mac": "https://9to5mac.com/guides/mac/feed",
+    "News": "http://www.npr.org/rss/rss.php?id=1001",
+    "Nvidia": "https://nvidianews.nvidia.com/releases.xml",
+    "Raspberry Pi": "https://www.raspberrypi.com/news/feed/",
+    "Music": "https://www.billboard.com/c/music/music-news/feed/"
+}
+
+# Use curses to create a menu of topics, then return the feed URL for the
+# chosen topic
+def menu(stdscr):
+    chosen_topic = get_url_for_topic(stdscr)
+    url = topic_urls[chosen_topic] if chosen_topic in topic_urls else "Topic not found"
+
+    stdscr.addstr(len(topic_urls) + 3, 0, f"Selected URL for {chosen_topic}: {url}")
+    stdscr.refresh()
+
+    return url
+
+# Let the user pick a topic with the arrow keys and return that topic's name
+def get_url_for_topic(stdscr):
+    curses.curs_set(0)  # Hide the cursor
+    stdscr.clear()
+
+    stdscr.addstr(0, 0, "Choose a topic using the arrow keys (Press Enter to select):")
+
+    # Create a list of topics
+    topics = list(topic_urls.keys())
+    current_topic = 0
+
+    while True:
+        for i, topic in enumerate(topics):
+            if i == current_topic:
+                stdscr.addstr(i + 2, 2, f"> {topic}")
+            else:
+                stdscr.addstr(i + 2, 2, f"  {topic}")
+
+        stdscr.refresh()
+
+        key = stdscr.getch()
+
+        if key == curses.KEY_DOWN and current_topic < len(topics) - 1:
+            current_topic += 1
+        elif key == curses.KEY_UP and current_topic > 0:
+            current_topic -= 1
+        elif key == 10:  # Enter key
+            return topics[current_topic]
+
+# Get the last N URLs from an RSS feed
+def getUrls(feed_url, n=20):
+    feed = feedparser.parse(feed_url)
+    entries = feed.entries[-n:]
+    urls = [entry.link for entry in entries]
+    return urls
+
+# News pages are often full of ads and menus. This uses newspaper3k to pull
+# out just the text of the article itself.
+def getArticleText(url):
+    article = Article(url)
+    article.download()
+    article.parse()
+    return article.text
+
+# Ask Ollama for a short summary of the article text
+def get_summary(text):
+    systemPrompt = "Write a concise summary of the text, and return your response as 5 lines that cover the key points of the text given."
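+    # The instruction goes in the system prompt; the raw article text is the
+    # prompt itself. /api/generate returns the completion under 'response'.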
+    prompt = text
+
+    url = "http://localhost:11434/api/generate"
+
+    payload = {
+        "model": "mistral-openorca",
+        "prompt": prompt,
+        "system": systemPrompt,
+        "stream": False
+    }
+    payload_json = json.dumps(payload)
+    headers = {"Content-Type": "application/json"}
+    response = requests.post(url, data=payload_json, headers=headers)
+
+    return json.loads(response.text)["response"]
+
+# Perform K-nearest neighbors (KNN) search
+def knn_search(question_embedding, embeddings, k=5):
+    X = np.array([item['embedding'] for article in embeddings for item in article['embeddings']])
+    source_texts = [item['source'] for article in embeddings for item in article['embeddings']]
+
+    # Fit a KNN model on the embeddings
+    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
+    knn.fit(X)
+
+    # Find the indices and distances of the k-nearest neighbors
+    distances, indices = knn.kneighbors(question_embedding, n_neighbors=k)
+
+    # Get the indices and source texts of the best matches
+    best_matches = [(indices[0][i], source_texts[indices[0][i]]) for i in range(k)]
+
+    return best_matches
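+
+# Hypothetical usage sketch (the names below are assumptions, mirroring how
+# summ.py builds its data):
+#
+#   model = SentenceTransformer('all-MiniLM-L6-v2')
+#   question_embedding = model.encode(["What was announced this week?"])
+#   for index, source_text in knn_search(question_embedding, allEmbeddings, k=5):
+#       print(index, source_text[:80])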