-
Notifications
You must be signed in to change notification settings - Fork 7
/
Makefile
129 lines (106 loc) · 5.04 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Print a usage banner followed by one line per documented target.
# A target is "documented" when its rule header carries a trailing `## text`
# comment: awk splits such a header on ":.*##" into the name ($1) and text ($2).
# The name pattern accepts digits and dots as well, so that targets such as
# build_llama.cpp can be listed; awk regexes have no lazy quantifiers, so the
# pattern uses a plain `.*` before the `##` marker.
.PHONY: help
help: ## Show this help
	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z0-9_.-]+:.*##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST)
# Format all Python files below the current directory with the `pylsp.black`
# command, using a 100-column line limit.
# Declared phony because the target is a command, not a file to build.
.PHONY: format_code_black
format_code_black: ## Format all Python files with pylsp.black (line length 100)
	pylsp.black --line-length 100 .
# Format all Python files below the current directory with `ruff format`,
# using a 100-column line limit.
# Declared phony because the target is a command, not a file to build.
.PHONY: format_code_ruff
format_code_ruff: ## Format all Python files with ruff (line length 100)
	ruff format --line-length 100 .
# Reorder and clean the JSON database: entries are sorted by their `url` key
# and the `file` key is removed from each entry.
# Reads ~/Downloads/files_as_chunks.json and writes tmp.json in the current
# directory. jq reads the input file directly (no intermediate `cat`).
.PHONY: reorder_json_chunks
reorder_json_chunks: ## Sort files_as_chunks.json by url and drop the file key (writes tmp.json)
	# Useful to compare different versions of this file.
	jq 'sort_by(.url) | map(del(.file))' ~/Downloads/files_as_chunks.json > tmp.json
# Print a few statistics about the files under _data/:
#   - number of distinct `url` values in sheets_as_chunks.json,
#   - number of top-level entries in sheets_as_chunks.json and questions.json,
#   - the content of _data/chunks.info.
# jq reads each file directly instead of going through `cat`.
.PHONY: info
info: ## Print statistics about the sheets, chunks and questions in _data/
	@echo "Number of sheets: $$(jq '.[] .url' _data/sheets_as_chunks.json | sort | uniq | wc -l)"
	@echo "Number of chunks: $$(jq 'length' _data/sheets_as_chunks.json)"
	@echo "Number of questions (from sheets): $$(jq 'length' _data/questions.json)"
	@echo
	@cat _data/chunks.info
# Download the two Colab notebooks into notebooks/.
# The directory is created first so that `wget -O` does not fail on a fresh
# checkout where notebooks/ does not exist yet.
.PHONY: fetch_colab_notebooks
fetch_colab_notebooks: ## Download the finetuning and inference notebooks into notebooks/
	mkdir -p notebooks
	wget 'https://colab.research.google.com/drive/1_FQw20VjpKaE-Al-dh4jfVRtPawbD0fe' -O notebooks/llama-finetuning-7b-4bit.ipynb
	wget 'https://colab.research.google.com/drive/148aZEs2-3hkCeTya1h4YPdfpqGIL5A4p' -O notebooks/llama-inference-7b-4bit.ipynb
# Download the JSON export of the "export-expa-c-riences" dataset.
# Without -O, wget names the file after the last URL component ("json") and
# drops it in the current directory; the `institutions` target reads
# _data/export-expa-c-riences.json, so the download is written there directly.
.PHONY: download_experiences
download_experiences: ## Download the experiences dataset to _data/export-expa-c-riences.json
	mkdir -p _data
	wget 'https://opendata.plus.transformation.gouv.fr/api/explore/v2.1/catalog/datasets/export-expa-c-riences/exports/json' -O _data/export-expa-c-riences.json
# Reminder target: it runs no download command, its recipe only consists of
# shell comments listing the pages the XML files have to be fetched from.
# Declared phony because the target is a command, not a file to build.
.PHONY: download_servicepublic_sheets
download_servicepublic_sheets: ## Show where to download the service-public.fr XML sheets (manual step)
	# Download xml files from:
	# https://www.data.gouv.fr/fr/datasets/service-public-fr-guide-vos-droits-et-demarches-particuliers/
	# https://www.data.gouv.fr/fr/datasets/service-public-fr-guide-vos-droits-et-demarches-entreprendre/
	# https://www.data.gouv.fr/fr/datasets/service-public-fr-guide-vos-droits-et-demarches-associations/
# Reminder target: the wget command below is commented out, so nothing is
# downloaded; the line only documents where fiches-travail.json comes from.
# Declared phony because the target is a command, not a file to build.
.PHONY: download_travailemploie_sheets
download_travailemploie_sheets: ## Show where to download fiches-travail.json (manual step)
	# wget https://github.com/SocialGouv/fiches-travail-data/raw/master/data/fiches-travail.json
# Build _data/institutions.json: the sorted, de-duplicated list of non-null
# `intitule_typologie_1` values found in _data/export-expa-c-riences.json.
# The result is written to a temporary file and moved into place only when jq
# succeeds, so a failure (e.g. missing input) cannot leave a truncated
# institutions.json behind.
.PHONY: institutions
institutions: ## Extract the list of institutions into _data/institutions.json
	jq 'map(.intitule_typologie_1) | unique | map(select(. != null))' _data/export-expa-c-riences.json > _data/institutions.json.tmp
	mv -f _data/institutions.json.tmp _data/institutions.json
# Print, on stdout, the sorted unique names of the national directory that
# carry an acronym. The `acronyms` target reads acronyms_directory.txt, so the
# output is meant to be redirected there:
#   make acronyms_directory > acronyms_directory.txt
# Pipeline:
#   rg        keep the lines containing "nom"
#   grep      keep the lines ending with `)",`
#   cut       keep the second ':'-separated field
#   grep -oP  keep the text between the first and last double quote (needs PCRE support)
#   grep -E   keep the lines holding a parenthesised token that starts with an
#             uppercase letter or digit followed by at least two alphanumerics
.PHONY: acronyms_directory
acronyms_directory: ## Print directory names carrying an acronym (redirect to acronyms_directory.txt)
	@rg '"nom"' _data/directory/national_data_directory.json | grep ')",$$' | cut -d: -f 2 | grep -oP '(?<=\").*(?=\")' | grep -E '\([A-Z0-9][0-9a-zA-Z]{2,}\)' | sort | uniq
# Print, on stdout, the sorted unique titles found in the XML files below the
# current directory that end with a parenthesised acronym. The `acronyms`
# target reads acronyms_sp.txt, so the output is meant to be redirected there:
#   make acronyms_sp > acronyms_sp.txt
# The start directory is given explicitly (required by non-GNU find) and file
# names are passed NUL-separated so paths containing whitespace are not split.
# xmllint errors are discarded (2>/dev/null).
.PHONY: acronyms_sp
acronyms_sp: ## Print sheet titles carrying an acronym (redirect to acronyms_sp.txt)
	@find . -iname "*.xml" -print0 | xargs -0 xmllint --xpath "//*[name()='OuSAdresser']/Titre/text() | //Fiche//Titre/text()" 2>/dev/null | grep -oE '.*\([A-Z0-9][0-9a-zA-Z]{2,}\)' | sort | uniq
# Merge acronyms_sp.txt and acronyms_directory.txt into _data/acronyms.txt.
# The prerequisites (acronyms_directory acronyms_sp) stay disabled as in the
# original: those targets print to stdout, while this recipe reads the two
# .txt files from the current directory and removes them at the end.
# Steps:
#   1. concatenate both lists, then sort and de-duplicate them;
#   2. drop every line holding more than one opening parenthesis, i.e. more
#      than one acronym. A single `grep -v` removes exactly the lines the former
#      two-step "collect duplicates, then subtract them" filter removed.
#      grep exits with status 1 when it selects no line, which is not an error
#      here, so only statuses above 1 fail the recipe;
#   3. remove the temporary and input files.
.PHONY: acronyms
acronyms: ## Merge acronyms_sp.txt and acronyms_directory.txt into _data/acronyms.txt
	cat acronyms_sp.txt acronyms_directory.txt > acronyms.txt
	sort acronyms.txt | uniq > acronyms.1.txt
	grep -v -E '(.*\(.*){2,}' acronyms.1.txt > _data/acronyms.txt || [ $$? -eq 1 ]
	rm acronyms.txt acronyms.1.txt
	rm acronyms_sp.txt acronyms_directory.txt
	# @Warning: Duplicates have been processed manually!
	# @todo:delete: Nantes, Montpellier, Toulouse, Secrétariat, Guangzhou, Fondatation
	# @todo:delete: CES, BUDGET, Sacem, CIO, Inee, CDC
	# @todo:add: CNI
	# ./script/acronyms_to_json.py
# Clone llama.cpp, pin the checkout to commit dadbed9 and build it.
# Each recipe line runs in its own shell, so a standalone `cd llama.cpp` does
# not carry over to the following lines: the reset and the build would run in
# this repository instead. Both commands are therefore pointed at the
# llama.cpp directory explicitly (`git -C`, `$(MAKE) -C`).
# The clone is skipped when the directory already exists so the target can be
# run again.
# @DEBUG/Upgrade: stick to the older version until gguf is fully supported.
.PHONY: build_llama.cpp
build_llama.cpp: ## Clone llama.cpp, pin it to commit dadbed9 and build it
	[ -d llama.cpp ] || git clone https://github.com/ggerganov/llama.cpp
	git -C llama.cpp reset --hard "dadbed9"
	$(MAKE) -C llama.cpp
# Convert a model with llama.cpp's convert.py, then quantize the result (q4_K).
# The original recipe used literal <model>/<outfile>/<outquantfile>
# placeholders, which the shell parses as redirections; they are now Make
# variables that must be given on the command line:
#   make convert_model_for_cpu MODEL=... OUTFILE=... OUTQUANTFILE=...
# The first recipe line aborts with an explicit message when one is missing.
# NOTE(review): convert.py's output option is assumed to be `--outfile` — confirm.
# NOTE(review): `quantize` is assumed to be the executable produced by building
# llama.cpp (not a Python script), hence it is invoked directly — confirm.
.PHONY: convert_model_for_cpu
convert_model_for_cpu: ## Convert and quantize a model (requires MODEL, OUTFILE, OUTQUANTFILE)
	@: $(foreach v,MODEL OUTFILE OUTQUANTFILE,$(if $(strip $($(v))),,$(error $(v) is not set. Usage: make convert_model_for_cpu MODEL=<model> OUTFILE=<outfile> OUTQUANTFILE=<outquantfile>)))
	python llama.cpp/convert.py "$(MODEL)" --outfile "$(OUTFILE)"
	./llama.cpp/quantize "$(OUTFILE)" "$(OUTQUANTFILE)" q4_K
# Build every search index through `pyalbert.py index`:
#   - bm25 indexes for experiences, sheets and chunks;
#   - e5 indexes for experiences and chunks.
# Original note on this target: "not embeddings".
# Declared phony because the target is a command, not a file to build.
.PHONY: build_all_indexes
build_all_indexes: ## Build the bm25 and e5 indexes with pyalbert.py
	# elasticsearch
	python3 pyalbert.py index experiences --index-type bm25
	python3 pyalbert.py index sheets --index-type bm25
	python3 pyalbert.py index chunks --index-type bm25
	python3 pyalbert.py index experiences --index-type e5
	python3 pyalbert.py index chunks --index-type e5
# Delete the indexes on localhost:9202 (experiences, sheets, chunks) and the
# collections on localhost:6333 (experiences, chunks).
# Original note on this target: "not embeddings".
# The meilisearch deletions are kept commented out, as in the original.
# The rule is now named clean_all_indexes; the historical, misspelled name
# clean_all_indexex remains available as an alias so existing invocations
# keep working.
.PHONY: clean_all_indexes clean_all_indexex
clean_all_indexes: ## Delete all search indexes and collections
	# elasticsearch
	curl -XDELETE http://localhost:9202/experiences
	curl -XDELETE http://localhost:9202/sheets
	curl -XDELETE http://localhost:9202/chunks
	# meilisearch
	#curl -X DELETE http://localhost:7700/indexes/experiences
	#curl -X DELETE http://localhost:7700/indexes/sheets
	#curl -X DELETE http://localhost:7700/indexes/chunks
	# qdrant
	curl -X DELETE http://localhost:6333/collections/experiences
	curl -X DELETE http://localhost:6333/collections/chunks
clean_all_indexex: clean_all_indexes ## Alias of clean_all_indexes (historical spelling)
# List the indexes served on localhost:9202 and the collections served on
# localhost:6333.
# The second request is piped into jq: curl runs with -sS so its progress meter
# does not get mixed into the output (errors are still shown), and jq is given
# the explicit identity filter `.` rather than relying on an implicit default.
.PHONY: list_indexes
list_indexes: ## List the existing indexes and collections
	# elasticsearch
	curl -X GET "http://localhost:9202/_cat/indices?v"
	# qdrant
	curl -sS -X GET "http://localhost:6333/collections" | jq .
# Defaults for the osc-cli targets below.
# The usage notes sit on their own lines: with a trailing comment on the
# assignment line, the whitespace before `#` becomes part of the value, and
# Make also keeps quotes literally, so the values are written bare here.
# `?=` lets the command line or the environment override the default.

# Profile passed to osc-cli through --profile.
# Usage: make list_vms OSC_PROFILE="cloudgouv"
OSC_PROFILE ?= default

# Identifier of the VM used by get_vm, start_vm and stop_vm.
# Usage: make get_vm VMID=i-bb5568c0
VMID ?= i-3fcd96ff
# List the VMs visible to the selected osc-cli profile, reduced by jq to the
# VmId, State and Tags fields of each entry.
# Declared phony because the target is a command, not a file to build.
.PHONY: list_vms
list_vms: ## List VMs (VmId, State, Tags) for OSC_PROFILE
	@osc-cli api ReadVms --profile "$(OSC_PROFILE)" | jq ".Vms | .[] | { VmId, State, Tags }"
# Show VmId, State and Tags of the single VM identified by VMID.
# The filter object is written on one line and without the trailing comma the
# original had after the VmIds list, so the argument is well-formed JSON.
.PHONY: get_vm
get_vm: ## Show VmId, State and Tags of the VM identified by VMID
	@osc-cli api ReadVms --profile "$(OSC_PROFILE)" \
		--Filters "{\"VmIds\": [\"$(VMID)\"]}" | jq ".Vms | .[] | { VmId, State, Tags }"
# Start the VM identified by VMID using the selected osc-cli profile.
# Declared phony because the target is a command, not a file to build.
.PHONY: start_vm
start_vm: ## Start the VM identified by VMID
	@osc-cli api StartVms --profile "$(OSC_PROFILE)" --VmIds "[\"$(VMID)\"]"
# Stop the VM identified by VMID after an interactive confirmation.
# The prompt is printed with printf and the answer read with plain `read`:
# `read -p` is a bash extension and fails under a POSIX /bin/sh (Make's default
# shell). Only "y" or "yes" (any letter case) confirms; any other answer exits
# with status 1, which makes Make abort before the StopVms call on the next
# recipe line.
.PHONY: stop_vm
stop_vm: ## Stop the VM identified by VMID (asks for confirmation)
	@printf 'Please confirm to STOP this VM (%s)? (y/n) ' "$(VMID)"; \
	read -r answer; \
	case "$$answer" in \
		[yY]|[yY][eE][sS]) echo "You answered yes. Continuing..." ;; \
		*) echo "You answered no. Stopping..."; exit 1 ;; \
	esac
	@osc-cli api StopVms --profile "$(OSC_PROFILE)" --VmIds "[\"$(VMID)\"]"