import React, { useEffect, useRef, useState } from 'react';
import './LLMsWork.css';
import openAFIImage1 from '../assets/pretraining_finetuning.png';
import openAFIImage2 from '../assets/f117.png';
import openAFIImage3 from '../assets/jet_1.jpg';
import openAFIImage4 from '../assets/sadant.jpg';
import openAFIImage5 from '../assets/useravi.jpg';
import openAFIImage6 from '../assets/pretraining_finetuning.png';
import openAFIImage7 from '../assets/pretraining_finetuning.png';
import openAFIImage8 from '../assets/pretraining_finetuning.png';
import openAFIImage9 from '../assets/pretraining_finetuning.png';
import openAFIImage10 from '../assets/pretraining_finetuning.png';

function LLMsWork() {
  // Index of the section most recently reported as intersecting by the
  // IntersectionObserver callback below. Only the setter is called in this
  // component; the value itself is never read here.
  const [currentSection, setCurrentSection] = useState(0);
  // Mutable list of the section container DOM nodes. It is filled by the
  // addToRefs ref callback and iterated by the observer effect.
  const sectionRefs = useRef([]);

  /**
   * Reports whether the current user agent string looks like a mobile device.
   *
   * @returns {boolean} true when navigator.userAgent matches one of the listed
   *   mobile tokens (case-insensitive); false otherwise, including when no
   *   `navigator` global is available (e.g. when run outside a browser).
   */
  const isMobile = () => {
    // Guard first: dereferencing a missing `navigator` would throw.
    if (typeof navigator === 'undefined') return false;
    return /Android|webOS|iPhone|iPad|iPod|BlackBerry|IEMobile|Opera Mini/i.test(navigator.userAgent);
  };

  // Static copy for the ten pipeline steps rendered by this component, in
  // display order. Because it is declared inside the component, the array is
  // rebuilt on every render. Each entry provides:
  //   title   - heading text, already prefixed with its step number
  //   content - body text; '\n' marks line breaks (renderContent splits on it)
  //   image   - imported asset rendered next to the text
  // Steps 5-10 all use imports of the same pretraining_finetuning.png file —
  // presumably placeholder artwork; confirm before shipping.
  const sections = [
    {
      title: "1. Data Collection",
      content: "Initially, every single Air Force Instruction (AFI) was downloaded onto my hard drive. E-pubs is routinely checked for new AFIs and AFI updates, which are then downloaded to ensure the AFI archive is always up to date.",
      image: openAFIImage2
    },
    {
      title: "2. Preprocessing",
      content: "Once collected, the AFI documents undergo a rigorous preprocessing phase. This crucial step prepares the raw text for analysis and embedding creation, ensuring that the data is clean, standardized, and ready for further processing. The preprocessing involves several key steps:\n\n• Cleaning the Text: Removing formatting artifacts, special characters, and irrelevant information to ensure that only meaningful text is retained. This step helps to eliminate noise and improve the quality of the data.\n\n• Standardizing Document Structures: Ensuring consistency across different AFIs by standardizing document structures. This involves aligning the format and layout of various documents to a common standard.\n\n• Tokenization: Breaking down the text into individual words or subwords, which allows for more detailed analysis. Tokenization helps in understanding the composition of the text and prepares it for further processing.\n\n• Lowercasing and Removing Punctuation: Reducing variability in the text by converting all characters to lowercase and removing punctuation marks. This step helps in treating words uniformly, regardless of their original case or punctuation.\n\n• Removing Stop Words: Eliminating common words like 'the', 'and', 'is' that do not carry significant meaning. This helps in focusing on the more meaningful and informative parts of the text.\n\n• Stemming or Lemmatization: Reducing words to their base or dictionary form, which helps in standardizing different forms of the same word. For example, 'running' might be reduced to 'run'.\n\n• Handling Acronyms and Military-Specific Terminology: Special attention is given to acronyms and military-specific terminology to ensure they are correctly interpreted and processed. This step is crucial for accurately capturing the meaning of specialized terms used in the AFIs.",
      image: openAFIImage3
    },
    {
      title: "3. Embedding Creation",
      content: "We create the embeddings locally using a BERT-based model from the preprocessed data. This process involves generating high-dimensional vector representations that capture a reasonable amount of nuance and meaning from the text. The key steps in this process are:\n\n• Using a BERT-Based Model: Leveraging a pre-trained BERT-based model to create embeddings from the preprocessed AFI documents. This model is well-suited for understanding and representing the text's semantic content.\n\n• Creating High-Dimensional Vector Representations: Generating high-dimensional vector embeddings for each chunk of text. These embeddings encapsulate the meaning and context of the text, allowing for nuanced understanding and analysis.\n\n• Capturing Semantic Relationships: The embeddings are designed to capture the meaning and context of the text beyond simple keyword matching. This allows the system to understand nuanced relationships between different parts of the text.\n\n• Efficient Processing and Analysis: The embedding process transforms the text into a format that machines can process efficiently. This facilitates sophisticated analysis, comparison, and retrieval of information based on the text's meaning and context.",
      image: openAFIImage4
    },
    {
      title: "4. Metadata Tagging",
      content: "To ensure traceability and attribution, each embedding is tagged with metadata. This step is crucial for providing accurate references in the system's responses and maintaining the integrity of the information. The metadata tagging process includes several key steps:\n\n• Extracting Relevant Information: Extracting important metadata from each AFI document, such as:\n  - Document Title and Identifier: Capturing the official title and unique identifier for each document.\n  - Publication Date and Version Number: Recording the publication date and version number to ensure the information is current and accurate.\n  - Section and Subsection Numbers: Identifying specific sections and subsections within the documents to provide precise references.\n  - Chapter Titles and Subtitles: Including chapter titles and subtitles to give context to the information.\n\n• Associating Metadata with Embeddings: Linking the extracted metadata with the corresponding text embeddings. This association ensures that each piece of information can be accurately traced back to its source.\n\n• Creating a Robust Linking System: Developing a robust system that links embeddings to their source documents. This system enables users to easily verify the information and trace it back to the original document.",
      image: openAFIImage5
    },
    {
      title: "5. Vector Database Storage",
      content: "The embeddings, along with their associated metadata, are stored in a specialized vector database. This type of database is optimized for storing and querying high-dimensional vectors efficiently. Key aspects of this storage system include:\n\n• Indexing: Creating efficient index structures (e.g., HNSW, IVF) to enable fast similarity searches. These structures allow the database to quickly find and retrieve the most relevant embeddings based on the user's query.\n\n• Scalability: Designing the database to handle millions of embeddings and support concurrent queries. This ensures that the system can scale to accommodate a growing number of documents and users without sacrificing performance.\n\n• Update Mechanisms: Implementing processes to update the database as new AFIs are published or existing ones are revised. This ensures that the database always contains the most current information.\n\n• Compression Techniques: Applying methods to reduce storage requirements while maintaining search accuracy. Compression helps to manage storage costs and improve query performance without compromising the quality of the results.\n\n• Integration with Metadata: Ensuring seamless connections between vector data and associated metadata. This integration allows for efficient retrieval of both the embeddings and their corresponding metadata, providing comprehensive and accurate responses to user queries.",
      image: openAFIImage6
    },
    {
      title: "6. User Query Processing",
      content: "When a user submits a query, OpenAFI processes it to find the most relevant information. This involves several key steps to ensure that the query is accurately understood and matched with the appropriate content:\n\n• Applying Preprocessing Steps: The same preprocessing steps used for the AFI documents are applied to the user's query. This includes cleaning the text, tokenization, lowercasing, removing punctuation, and handling any special terminology. This ensures consistency between the query and the stored AFI documents.\n\n• Creating a Vector Embedding via LLM API: A vector embedding of the preprocessed query is created using a Large Language Model (LLM) API with the same dimensions as the embeddings created for the AFI documents. This embedding captures the semantic meaning of the user's question, allowing the system to understand the context and nuances of the query.\n\n• Context-Aware Searching: The query embedding is used for context-aware searching in the vector database. By representing the query in the same high-dimensional space as the AFI embeddings, the system can effectively match the query with the most relevant information, even if the wording differs.",
      image: openAFIImage7
    },
    {
      title: "7. Vector Similarity Search",
      content: "The system performs a similarity search in the vector database to find the most relevant AFI content. This process includes several key steps:\n\n• Using Efficient Algorithms: The system uses efficient algorithms to compute the similarity between the query embedding and the stored AFI embeddings. These algorithms are optimized for fast and accurate retrieval of relevant information.\n\n• Similarity Metrics: Similarity is typically measured using metrics like cosine similarity or Euclidean distance. These metrics help determine how closely the query embedding matches the stored embeddings.\n\n• Retrieving Top Matches: The system retrieves the top N most similar embeddings and their associated metadata. This ensures that the most relevant information is identified and made available for further processing.\n\n• Context-Aware Information Retrieval: This approach allows OpenAFI to find relevant information even when the exact wording differs between the query and the AFI content. By focusing on semantic similarity, the system can understand the meaning and context of the query and match it with the appropriate content.",
      image: openAFIImage8
    },
    {
      title: "8. Content Retrieval and Ranking",
      content: "Based on the similarity search results, OpenAFI retrieves the relevant content from the AFIs. This step involves several key processes to ensure that the most pertinent information is presented to the user:\n\n• Fetching Full Text: Retrieving the full text associated with the top-matching embeddings. This ensures that the entire context of the relevant sections is available for further analysis.\n\n• Applying Additional Ranking Algorithms: Using additional ranking algorithms to refine the relevance of the retrieved content. These algorithms help prioritize the most useful and specific information based on the query.\n\n• Considering Multiple Factors: Taking into account various factors such as the recency of the AFI, the specificity of the match, and the hierarchical structure of the documents. This ensures that the most current, relevant, and contextually appropriate information is prioritized.",
      image: openAFIImage9
    },
    {
      title: "9. Response Generation",
      content: "Using the retrieved and ranked content, OpenAFI generates a comprehensive response to the user's query. This process involves several key steps to ensure that the response is clear, accurate, and contextually appropriate:\n\n• Synthesizing Information: Combining information from multiple relevant sections if necessary. This ensures that the response is thorough and addresses all aspects of the user's query.\n\n• Employing Natural Language Generation: Utilizing natural language generation (NLG) techniques to create clear, coherent answers. This helps in presenting the information in a readable and understandable format.\n\n• Maintaining Accuracy: Ensuring that the response directly addresses the user's question while maintaining accuracy. This involves cross-referencing the retrieved content to confirm its relevance and correctness.\n\n• Structuring the Response: Organizing the response to provide context and easy-to-follow explanations. This includes breaking down complex information into simpler parts and using headings or bullet points where appropriate.",
      image: openAFIImage10
    },
    {
      title: "10. Source Attribution and Presentation",
      content: "Finally, OpenAFI provides source attribution for its responses and presents the information to the user. This step ensures transparency and allows users to verify the information. The process involves several key steps:\n\n• Including References: Providing references to the specific AFIs, sections, and paragraphs used to generate the response. This ensures that users can trace the information back to its original source.\n\n• Formatting for Readability: Formatting the response for easy readability, potentially including relevant quotes from the AFIs. This helps in presenting the information in a clear and accessible manner.",
      image: openAFIImage1
    }
  ];

  // Scroll tracking for non-mobile user agents: observes every registered
  // section element, toggles its 'visible' class as it crosses the observed
  // band, and records the index of the section that just became visible.
  // When isMobile() is true nothing is observed, so no 'visible' class is
  // ever added by this effect. Runs once after the first render.
  useEffect(() => {
    if (isMobile()) return;

    const observerOptions = {
      // null root = intersect against the browser viewport
      root: null,
      // negative margins shrink the observed area by 20% at top and bottom
      rootMargin: '-20% 0px -20% 0px',
      // notify when half of a section is inside that area
      // NOTE(review): a section taller than twice the observed band can never
      // be 50% inside it — confirm the longest sections still become visible.
      threshold: 0.5
    };

    const observerCallback = (entries) => {
      entries.forEach((entry) => {
        if (entry.isIntersecting) {
          // data-index is set on each section container in the render output;
          // pass the radix explicitly so the parse is always base 10.
          const index = Number.parseInt(entry.target.getAttribute('data-index'), 10);
          setCurrentSection(index);
          entry.target.classList.add('visible');
        } else {
          entry.target.classList.remove('visible');
        }
      });
    };

    const observer = new IntersectionObserver(observerCallback, observerOptions);

    sectionRefs.current.forEach((ref) => {
      if (ref) observer.observe(ref);
    });

    // disconnect() releases every observed target in one call, so cleanup no
    // longer depends on what sectionRefs.current holds at teardown time.
    return () => {
      observer.disconnect();
    };
  }, []);

  // Ref callback for the section containers: remembers each mounted element
  // exactly once and ignores empty values (React passes null on detach).
  const addToRefs = (el) => {
    if (!el) return;

    const registered = sectionRefs.current;
    if (registered.indexOf(el) === -1) {
      registered.push(el);
    }
  };

  const renderContent = (content) => {
    return content.split('\n').map((line, index) => (
      <React.Fragment key={index}>
        {line}
        <br />
      </React.Fragment>
    ));
  };

  return (
    <div className='llms-work'>
      {sections.map((section, index) => (
        <div 
          key={index} 
          className={`section-container ${!isMobile() ? 'fade-in' : ''}`} 
          ref={addToRefs} 
          data-index={index}
        >
          <div className='content'>
            <div className='text-content'>
              <h2>{section.title}</h2>
              <p>{renderContent(section.content)}</p>
            </div>
            <div className='image-content'>
              <img src={section.image} alt={`OpenAFI Process Step ${index}`} className='img' />
            </div>
          </div>
        </div>
      ))}
    </div>
  );
}

export default LLMsWork;