calc_llm_list.py
text = """
| Meta | [LLaMA](https://github.com/facebookresearch/llama) | en | - | LLaMA-13B outperforms GPT-3 (175B) and LLaMA-65B is competitive with PaLM-540B.<br />Base model for most follow-up works. |
| @ggerganov | [llama.cpp](https://github.com/ggerganov/llama.cpp) | en | LLaMA | C/C++ implementation of LLaMA and some other models, using quantization. |
| Stanford | [Alpaca](https://github.com/tatsu-lab/stanford_alpaca) | en | LLaMA/OPT | uses 52K instruction-following examples generated with Self-Instruct techniques to fine-tune 7B LLaMA;<br /> the resulting model, Alpaca, behaves similarly to the `text-davinci-003` model on the Self-Instruct instruction-following evaluation suite.<br />Alpaca has inspired many follow-up models. |
| LianJiaTech | [BELLE](https://github.com/LianjiaTech/BELLE) | en/zh | BLOOMZ-7B1-mt | perhaps the first Chinese model to follow Alpaca. |
| Tsinghua | [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B) | en/zh | GLM | well-known Chinese chat model that can run on a single GPU. |
| Databricks | [Dolly](https://github.com/databrickslabs/dolly) | en | GPT-J 6B | uses Alpaca data to fine-tune a 2-year-old model, GPT-J, which exhibits surprisingly high-quality<br /> instruction-following behavior not characteristic of the foundation model on which it is based. |
| @tloen | [Alpaca-LoRA](https://github.com/tloen/alpaca-lora) | en | LLaMA-7B | trained within hours on a single RTX 4090,<br />reproducing the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) results using [low-rank adaptation (LoRA)](https://arxiv.org/pdf/2106.09685.pdf),<br />and can run on a Raspberry Pi. |
| ColossalAI | [ColossalChat](https://github.com/hpcaitech/ColossalAI/blob/main/applications/Chat/README.md) | en/zh | LLaMA-7B | provides a unified large language model framework, including:<br />Supervised datasets collection<br />Supervised instructions fine-tuning<br />Reward model training<br />RLHF<br />Quantization inference<br />Fast model deploying<br />Perfectly integrated with the Hugging Face ecosystem |
| Shanghai AI Lab | [LLaMA-Adapter](https://github.com/ZrrSkywalker/LLaMA-Adapter) | en | LLaMA-7B | Fine-tuning LLaMA to follow instructions within 1 Hour and 1.2M Parameters |
| PhoebusSi | [Alpaca-CoT](https://github.com/PhoebusSi/Alpaca-CoT) | en/zh | LLaMA<br />ChatGLM-6B<br />BLOOM | extends Alpaca with CoT data to boost its reasoning ability;<br />aims to build an instruction fine-tuning (IFT) platform with an extensive instruction collection (especially the CoT datasets)<br /> and a unified interface for various large language models. |
| AetherCortex | [Llama-X](https://github.com/AetherCortex/Llama-X) | en | LLaMA | Open Academic Research on Improving LLaMA to SOTA LLM |
| TogetherComputer | [OpenChatKit](https://github.com/togethercomputer/OpenChatKit) | en | GPT-NeoX-20B | OpenChatKit provides a powerful, open-source base to create both specialized and general-purpose chatbots for various applications.<br /> The kit includes an instruction-tuned language model, a moderation model, and an extensible retrieval system for including <br />up-to-date responses from custom repositories. |
| nomic-ai | [GPT4All](https://github.com/nomic-ai/gpt4all) | en | LLaMA | trained on a massive collection of clean assistant data including code, stories and dialogue |
| @ymcui | [Chinese-LLaMA-Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) | en/zh | LLaMA-7B/13B | expands the Chinese vocabulary of the original LLaMA and uses Chinese data for secondary pre-training,<br /> further enhancing basic Chinese semantic understanding. Additionally, the project fine-tunes on Chinese instruction data<br /> on top of the Chinese LLaMA, significantly improving the model's understanding and execution of instructions. |
| UC Berkeley<br />Stanford<br />CMU | [Vicuna](https://github.com/lm-sys/FastChat) | en | LLaMA-13B | Impressing GPT-4 with 90% ChatGPT Quality. |
| @NouamaneTazi | [bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp) | en/zh | BLOOM | C++ implementation for BLOOM inference. |
| HKUST | [LMFlow](https://github.com/OptimalScale/LMFlow) / [RAFT](https://optimalscale.github.io/LMFlow/examples/raft.html) | en/zh | LLaMA<br />Galactica<br />GPT-2<br />... | LMFlow is an extensible, convenient, and efficient toolbox for fine-tuning large machine learning models, designed to be user-friendly,<br /> speedy, reliable, and accessible to the entire community.<br />RAFT is a new alignment algorithm that is more efficient than conventional (PPO-based) RLHF. |
| [Cerebras Systems](https://www.cerebras.net/) | [Cerebras-GPT](https://huggingface.co/cerebras/Cerebras-GPT-13B) | en | - | pretrained GPT-3-like LLM, commercially available, efficiently trained on the [Andromeda](https://www.cerebras.net/andromeda/) AI supercomputer,<br />trained in accordance with [Chinchilla scaling laws](https://arxiv.org/abs/2203.15556) (20 tokens per model parameter), which is compute-optimal. |
| UT Southwestern/<br />UIUC/OSU/HDU | [ChatDoctor](https://github.com/Kent0n-Li/ChatDoctor) | en | LLaMA | Maybe the first domain-specific chat model tuned on LLaMA. |
| LAION-AI | [Open Assistant](https://github.com/LAION-AI/Open-Assistant) | en | GPT-J<br />CodeGen<br />FlanT5<br />GPT-JT | a project meant to give everyone access to a great chat-based large language model. |
| UCSD/SYSU | [baize](https://github.com/project-baize/baize) | en<br />zh (coming soon) | LLaMA | fine-tuned with [LoRA](https://github.com/microsoft/LoRA). It uses 100k dialogs generated by letting ChatGPT chat with itself. <br />Alpaca's data is also used to improve its performance. |
| UC Berkeley | [Koala](https://github.com/young-geng/EasyLM) | en | LLaMA | rather than maximizing *quantity* by scraping as much web data as possible, the team focuses on collecting a small, *high-quality* dataset. |
| @imClumsyPanda | [langchain-ChatGLM](https://github.com/imClumsyPanda/langchain-ChatGLM) | en/zh | ChatGLM-6B | local-knowledge-based ChatGLM using LangChain. |
| @yangjianxin1 | [Firefly](https://github.com/yangjianxin1/Firefly) | zh | bloom-1b4-zh<br />bloom-2b6-zh | instruction tuning on Chinese datasets. Vocabulary pruning, ZeRO, and tensor parallelism<br /> are used to effectively reduce memory consumption and improve training efficiency. |
| microsoft | [GPT-4-LLM](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) | en/zh | LLaMA | aims to share data generated by GPT-4 for building instruction-following LLMs with supervised learning and reinforcement learning. |
| EleutherAI | [pythia](https://github.com/EleutherAI/pythia) | en | - | combines interpretability analysis and scaling laws to understand how knowledge develops<br /> and evolves during training in autoregressive transformers. |
| Hugging Face | [StackLLaMA](https://huggingface.co/trl-lib/llama-7b-se-rl-peft) | en | LLaMA | trained on StackExchange data; the main goal is to serve as a tutorial and walkthrough on<br /> how to train a model with RLHF, not primarily model performance. |
| Nebuly | [ChatLLaMA](https://github.com/nebuly-ai/nebullvm/tree/main/apps/accelerate/chatllama) | en | - | a library that allows you to create hyper-personalized ChatGPT-like assistants using your own data and the least amount of compute possible. |
| @juncongmoo | [ChatLLaMA](https://github.com/juncongmoo/chatllama) | en | LLaMA | LLaMA-based RLHF model, runnable on a single GPU. |
| @juncongmoo | [minichatgpt](https://github.com/juncongmoo/minichatgpt) | en | GPT/OPT ... | To Train ChatGPT In 5 Minutes with ColossalAI. |
| @LC1332 | [Luotuo-Chinese-LLM](https://github.com/LC1332/Luotuo-Chinese-LLM) | zh | LLaMA/ChatGLM | Instruction fine-tuned Chinese Language Models, with colab provided! |
| @Facico | [Chinese-Vicuna](https://github.com/Facico/Chinese-Vicuna) | zh | LLaMA | a Chinese instruction-following LLaMA-based model, fine-tuned with LoRA, with cpp inference supported and a colab provided. |
| @yanqiangmiffy | [InstructGLM](https://github.com/yanqiangmiffy/InstructGLM) | en/zh | ChatGLM-6B | ChatGLM-based instruction-following model, fine-tuned on a variety of data sources, supporting DeepSpeed acceleration and LoRA. |
| alibaba | [Wombat](https://github.com/GanjinZero/RRHF) | en | LLaMA | proposes a novel learning paradigm called RRHF as an alternative to RLHF; it scores responses generated by<br /> different sampling policies and learns to align them with human preferences through a ranking loss. The performance<br />is comparable to RLHF, with fewer models used in the process. |
| microsoft | [deepspeed-chat](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) | - | - | Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales. |
| @WuJunde | [alpaca-glassoff](https://github.com/WuJunde/alpaca-glassoff) | en | LLaMA | a mini chat AI that accepts image input and can run on your own laptop, based on [stanford-alpaca](https://github.com/tatsu-lab/stanford_alpaca) and [alpaca-lora](https://github.com/tloen/alpaca-lora). |
| Cambridge | [Visual Med-Alpaca](https://github.com/cambridgeltl/visual-med-alpaca) | en | LLaMA-7B | a multi-modal foundation model designed specifically for the biomedical domain |
| @JosephusCheung | [Guanaco](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset) | en/zh/jp/de | LLaMA-7B | A Multilingual Instruction-Following Language Model |
| KAUST | [CAMEL](https://github.com/lightaime/camel) | en/zh/jp/de ... | LLaMA | a novel communicative agent framework named *role-playing*, using *inception prompting* to<br /> guide chat agents toward task completion while maintaining consistency with human intentions. |
| BaihaiAI | [IDPChat](https://github.com/BaihaiAI/IDPChat) | en/zh | LLaMA-13B<br />Stable-diffusion | Chinese multi-modal model, single GPU runnable, easy to deploy, UI provided. |
| BlinkDL | [ChatRWKV](https://github.com/BlinkDL/ChatRWKV) | en/zh | **RNN** | powered by RWKV (**100% RNN**), with training sponsored by Stability and EleutherAI. |
| @FreedomIntelligence | [LLM Zoo](https://github.com/FreedomIntelligence/LLMZoo) | multi | BLOOMZ/LLaMA | a project that provides data, models, and an evaluation benchmark for large language models.<br />Models released: Phoenix, Chimera. |
| KAUST | [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4) | en/zh | LLaMA | MiniGPT-4 aligns a frozen visual encoder from BLIP-2 with a frozen LLM, Vicuna, using just one projection layer,<br /> and yields many emerging vision-language capabilities similar to those demonstrated in GPT-4. |
| HIT | [Huatuo](https://github.com/SCIR-HI/Huatuo-Llama-Med-Chinese) / [ChatGLM-Med](https://github.com/SCIR-HI/Med-ChatGLM) | zh | LLaMA/ChatGLM | fine-tuned with a Chinese medical knowledge dataset generated using the GPT-3.5 API. |
| UW–Madison/MSR<br />/Columbia University | [LLaVA](https://github.com/haotian-liu/LLaVA) | en | LLaMA | visual instruction tuning is proposed, towards building large language and vision models with GPT-4 level capabilities. |
| Stability-AI | [StableLM](https://github.com/Stability-AI/StableLM) | en | - | Stability AI Language Models. |
| ShanghaiTech, etc | [DoctorGLM](https://github.com/xionghonglin/DoctorGLM) | en/zh | ChatGLM-6B | Chinese medical consultation model fine-tuned on ChatGLM-6B. |
| TogetherComputer | [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data) | en | - | an open-source recipe to reproduce the LLaMA training dataset. |
| FDU | [MOSS](https://github.com/OpenLMLab/MOSS) | en/zh | - | An open-source tool-augmented conversational language model from Fudan University. |
| ssymmetry & FDU | [BBT-2](https://bbt.ssymmetry.com/) | zh | - | 120B open-source LM. |
| Tsinghua AIR | [BioMedGPT-1.6B](https://github.com/BioFM/OpenBioMed) | en/zh | - | a pre-trained multi-modal molecular foundation model with 1.6B parameters that associates 2D molecular graphs with texts. |
"""
import re
from bean.llm import LLM
from datetime import date
import json
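# The LLM bean imported above lives elsewhere in this repo and is not shown in
# this file. A minimal sketch of what it presumably looks like, inferred from
# the fields passed in parse_line_to_json below and the "voted"/"vote_count"
# keys deleted in the commented-out export code (all names/types here are
# assumptions, not the repo's actual definition):
#
#     from datetime import date
#     from typing import Optional
#     from pydantic import BaseModel
#
#     class LLM(BaseModel):
#         id: int
#         name: str
#         vendor: str
#         intro: str
#         url: str
#         region: str
#         publish_time: Optional[date] = None
#         voted: bool = False
#         vote_count: int = 0
#
# A pydantic BaseModel would also explain the llm.json() call below.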
# Matches markdown links of the form [name](url).
PATTERN_LINK = re.compile(r"\[(.*?)\]\((.*?)\)")
# Sample table row, kept around for quick manual testing of parse_line_to_json.
line = "| Meta | [LLaMA](https://github.com/facebookresearch/llama) | en | - | LLaMA-13B outperforms GPT-3 (175B) and LLaMA-65B is competitive with PaLM-540B.<br />Base model for most follow-up works. |"
def parse_line_to_json(i: int, line: str):
    """Parse one markdown table row into the LLM bean's JSON representation.

    Columns after split("|"): [1] vendor, [2] markdown link "[name](url)",
    [3] language(s), [4] base model, [5] intro.
    """
    elements = line.split("|")
    elements = [e.strip() for e in elements]
    # print(elements)
    vendor = elements[1]
    name, url = PATTERN_LINK.findall(elements[2])[0]
    intro = elements[5]
    llm = LLM(id=i, name=name, vendor=vendor, intro=intro, url=url, region="", publish_time=None)
    return llm.json()
# print(parse_line_to_json(1, line))
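# For the sample row above, parse_line_to_json(1, line) should yield JSON along
# these lines (exact keys depend on the LLM bean, so treat this as a sketch):
#     {"id": 1, "name": "LLaMA", "vendor": "Meta",
#      "url": "https://github.com/facebookresearch/llama",
#      "intro": "LLaMA-13B outperforms ...", "region": "", "publish_time": null, ...}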
# names = []
# for line in text.splitlines():
#     if not line: continue
#     elements = line.split("|")
#     elements = [e.strip() for e in elements]
#     print(elements[2])
#     name, _ = PATTERN_LINK.findall(elements[2])[0]
#     names.append(name)
# print(names)
# llms = []
# for i, line in enumerate(text.splitlines()):
#     if not line: continue
#     obj = json.loads(parse_line_to_json(i, line))
#     del obj["voted"]
#     del obj["vote_count"]
#     llms.append(obj)
# json.dump(llms, open("llms.json", "w", encoding="utf-8"), ensure_ascii=False, indent=2)
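# The commented-out block above is presumably what originally produced llms.json,
# which the __main__ block at the bottom of this file then post-processes.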
# def get_github_repo_create_time(github_url: str):
#     import requests
#     import json
#     api_url = "https://api.github.com/repos/" + github_url.split("github.com/")[1]
#     resp = requests.get(api_url)
#     if resp.status_code != 200:
#         return None
#     data = json.loads(resp.text)
#     return data["created_at"]
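# Note: unauthenticated calls to api.github.com are rate-limited (roughly 60
# requests per hour), so dating every repo in the table in one pass would
# likely need an authenticated request, e.g. (the token is a placeholder):
#     resp = requests.get(api_url, headers={"Authorization": "token <YOUR_PAT>"})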
if __name__ == "__main__":
    import json
    import re

    PATTERN_MD_LINK = re.compile(r"\[(.*?)\]\((.*?)\)")
    llms = json.load(open("llms.json", encoding="utf-8"))
    for llm in llms:
        intro = llm["intro"]
        # Replace markdown links [name](url) with <a href="url">name</a>
        intro = PATTERN_MD_LINK.sub(r'<a href="\2">\1</a>', intro)
        llm["intro"] = intro
    json.dump(llms, open("llms.json", "w", encoding="utf-8"), ensure_ascii=False, indent=2)
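    # A quick sanity check of the substitution on a standalone string (an
    # assumed example; it does not touch llms.json):
    sample = "fine-tuned with [LoRA](https://github.com/microsoft/LoRA)."
    expected = 'fine-tuned with <a href="https://github.com/microsoft/LoRA">LoRA</a>.'
    assert PATTERN_MD_LINK.sub(r'<a href="\2">\1</a>', sample) == expected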