Merged
7 changes: 6 additions & 1 deletion .github/workflows/pylint.yml
@@ -22,6 +22,11 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install pylint
+        pip install -r ./hugegraph-llm/requirements.txt
+        pip install -r ./hugegraph-python-client/requirements.txt
     - name: Analysing the code with pylint
       run: |
-        pylint $(git ls-files '*.py')
+        export PYTHONPATH=$(pwd)/hugegraph-llm/src:$(pwd)/hugegraph-python-client/src
+        echo ${PYTHONPATH}
+        pylint --rcfile=./pylint.conf hugegraph-llm
+        pylint --rcfile=./pylint.conf hugegraph-python-client
42 changes: 27 additions & 15 deletions hugegraph-llm/examples/build_kg_test.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+
 import os
 from hugegraph_llm.operators.build_kg_operator import KgBuilder
 from hugegraph_llm.llms.openai_llm import OpenAIChat
@@ -22,28 +24,35 @@
 # If you need a proxy to access OpenAI's API, please set your HTTP proxy here
 os.environ["http_proxy"] = ""
 os.environ["https_proxy"] = ""
-api_key = ""
+API_KEY = ""

 default_llm = OpenAIChat(
-    api_key=api_key, model_name="gpt-3.5-turbo-16k", max_tokens=4000
+    api_key=API_KEY,
+    model_name="gpt-3.5-turbo-16k",
+    max_tokens=4000,
 )
-text = (
-    "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, "
-    "in his professional life, works as a journalist. Additionally, Sarah is the proud owner of the website "
-    "www.sarahsplace.com, while James manages his own webpage, though the specific URL is not mentioned here. "
-    "These two individuals, Sarah and James, have not only forged a strong personal bond as roommates but have "
-    "also carved out their distinctive digital presence through their respective webpages, showcasing their "
-    "varied interests and experiences."
+TEXT = (
+    "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with"
+    " since 2010. James, in his professional life, works as a journalist. Additionally, Sarah"
+    " is the proud owner of the website www.sarahsplace.com, while James manages his own"
+    " webpage, though the specific URL is not mentioned here. These two individuals, Sarah and"
+    " James, have not only forged a strong personal bond as roommates but have also carved out"
+    " their distinctive digital presence through their respective webpages, showcasing their"
+    " varied interests and experiences."
 )
 builder = KgBuilder(default_llm)
 # build kg with only text
-builder.parse_text_to_data(text).disambiguate_data().commit_data_to_kg().run()
+builder.parse_text_to_data(TEXT).disambiguate_data().commit_data_to_kg().run()
 # build kg with text and schemas
 nodes_schemas = [
     {
         "label": "Person",
         "primary_key": "name",
-        "properties": {"age": "int", "name": "text", "occupation": "text"},
+        "properties": {
+            "age": "int",
+            "name": "text",
+            "occupation": "text",
+        },
     },
     {
         "label": "Webpage",
@@ -58,12 +67,15 @@
"type": "roommate",
"properties": {"start": "int"},
},
{"start": "Person", "end": "Webpage", "type": "owns", "properties": {}},
{
"start": "Person",
"end": "Webpage",
"type": "owns",
"properties": {},
},
]
(
builder.parse_text_to_data_with_schemas(
text, nodes_schemas, relationships_schemas
)
builder.parse_text_to_data_with_schemas(TEXT, nodes_schemas, relationships_schemas)
.disambiguate_data_with_schemas()
.commit_data_to_kg()
.run()
Expand Down
3 changes: 3 additions & 0 deletions hugegraph-llm/requirements.txt
@@ -0,0 +1,3 @@
+openai==0.28.1
+retry==0.9.2
+tiktoken==0.5.1
2 changes: 2 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/llms/base.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+
 from abc import ABC, abstractmethod
 from typing import Any, List, Optional, Callable

10 changes: 6 additions & 4 deletions hugegraph-llm/src/hugegraph_llm/llms/openai_llm.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+
 from typing import Callable, List, Optional
 import openai
 import tiktoken
@@ -56,10 +58,10 @@ def generate(
             return str(f"Error: {e}")
         # catch authorization errors / do not retry
         except openai.error.AuthenticationError as e:
-            return "Error: The provided OpenAI API key is invalid"
+            return f"Error: The provided OpenAI API key is invalid, {e}"
         except Exception as e:
             print(f"Retrying LLM call {e}")
-            raise Exception()
+            raise Exception() from e

     async def generate_streaming(
         self,
@@ -86,11 +88,11 @@ async def generate_streaming(
                 await on_token_callback(message)
         return result

-    def num_tokens_from_string(self, string: str) -> int:
+    async def num_tokens_from_string(self, string: str) -> int:
         encoding = tiktoken.encoding_for_model(self.model)
         num_tokens = len(encoding.encode(string))
         return num_tokens

-    def max_allowed_token_length(self) -> int:
+    async def max_allowed_token_length(self) -> int:
         # TODO: list all models and their max tokens from api
         return 2049
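
Note on this hunk: num_tokens_from_string and max_allowed_token_length become coroutines here, so call sites would need to await them (the synchronous calls in parse_text_to_data.py below appear unchanged in this PR). A minimal runnable sketch of the token counting, assuming tiktoken is installed; the asyncio.run wrapper is only for demonstration:

import asyncio

import tiktoken


async def num_tokens_from_string(model: str, string: str) -> int:
    # tiktoken resolves the model name to its byte-pair encoding
    # (gpt-3.5-turbo-* maps to cl100k_base) and counts tokens.
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))


print(asyncio.run(num_tokens_from_string("gpt-3.5-turbo-16k", "Meet Sarah, a 30-year-old attorney.")))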
14 changes: 5 additions & 9 deletions hugegraph-llm/src/hugegraph_llm/operators/build_kg_operator.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+
 from hugegraph_llm.operators.hugegraph_op.commit_data_to_kg import CommitDataToKg
 from hugegraph_llm.operators.llm_op.disambiguate_data import DisambiguateData
 from hugegraph_llm.operators.llm_op.parse_text_to_data import (
@@ -33,9 +35,7 @@ def parse_text_to_data(self, text: str):
         self.parse_text_to_kg.append(ParseTextToData(llm=self.llm, text=text))
         return self

-    def parse_text_to_data_with_schemas(
-        self, text: str, nodes_schemas, relationships_schemas
-    ):
+    def parse_text_to_data_with_schemas(self, text: str, nodes_schemas, relationships_schemas):
         self.parse_text_to_kg.append(
             ParseTextToDataWithSchemas(
                 llm=self.llm,
@@ -47,15 +47,11 @@ def parse_text_to_data_with_schemas(
         return self

     def disambiguate_data(self):
-        self.parse_text_to_kg.append(
-            DisambiguateData(llm=self.llm, is_user_schema=False)
-        )
+        self.parse_text_to_kg.append(DisambiguateData(llm=self.llm, is_user_schema=False))
         return self

     def disambiguate_data_with_schemas(self):
-        self.parse_text_to_kg.append(
-            DisambiguateData(llm=self.llm, is_user_schema=True)
-        )
+        self.parse_text_to_kg.append(DisambiguateData(llm=self.llm, is_user_schema=True))
         return self

     def commit_data_to_kg(self):
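
Each KgBuilder method appends one operator and returns self, which is what lets the example file chain parse_text_to_data(...).disambiguate_data().commit_data_to_kg().run(). A simplified sketch of that fluent pipeline pattern, with hypothetical names rather than the actual operator classes:

class Pipeline:
    def __init__(self):
        self.steps = []

    def add(self, step):
        # Returning self enables the builder-style chaining seen in the example.
        self.steps.append(step)
        return self

    def run(self, data=None):
        # Each step receives the previous step's output, like the operators' run(data).
        for step in self.steps:
            data = step(data)
        return data


result = Pipeline().add(lambda _: {"nodes": []}).add(lambda d: {**d, "committed": True}).run()
print(result)  # {'nodes': [], 'committed': True}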
hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_data_to_kg.py
@@ -14,22 +14,23 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+
 import os
 from pyhugegraph.client import PyHugeClient


 def generate_new_relationships(nodes_schemas_data, relationships_data):
-    label_id = dict()
+    label_id = {}
     i = 1
     old_label = []
     for item in nodes_schemas_data:
         label = item["label"]
         if label in old_label:
             continue
-        else:
-            label_id[label] = i
-            i += 1
-            old_label.append(label)
+        label_id[label] = i
+        i += 1
+        old_label.append(label)
     new_relationships_data = []
     for relationship in relationships_data:
         start = relationship["start"]
@@ -45,7 +46,7 @@ def generate_new_relationships(nodes_schemas_data, relationships_data):
         for key1, value1 in end.items():
             if key1 == key:
                 new_end = f"{value}" + ":" + f"{value1}"
-        relationships_data = dict()
+        relationships_data = {}
         relationships_data["start"] = new_start
         relationships_data["end"] = new_end
         relationships_data["type"] = relationships_type
@@ -91,7 +92,7 @@ def generate_schema_nodes(data):
         properties = item["properties"]
         schema_statement = f"schema.vertexLabel('{label}').properties("
         schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
-        schema_statement += f").nullableKeys("
+        schema_statement += ").nullableKeys("
         schema_statement += ", ".join(
             f"'{prop}'" for prop in properties.keys() if prop != primary_key
         )
@@ -109,11 +110,14 @@ def generate_schema_relationships(data):
         end = item["end"]
         schema_relationships_type = item["type"]
         properties = item["properties"]
-        schema_statement = f"schema.edgeLabel('{schema_relationships_type}').sourceLabel('{start}').targetLabel('{end}').properties("
+        schema_statement = (
+            f"schema.edgeLabel('{schema_relationships_type}')"
+            f".sourceLabel('{start}').targetLabel('{end}').properties("
+        )
         schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
-        schema_statement += f").nullableKeys("
+        schema_statement += ").nullableKeys("
         schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
-        schema_statement += f").ifNotExist().create()"
+        schema_statement += ").ifNotExist().create()"
         schema_relationships_statements.append(schema_statement)
     return schema_relationships_statements

@@ -153,12 +157,9 @@ def run(self, data: dict):
         relationships = data["relationships"]
         nodes_schemas = data["nodes_schemas"]
         relationships_schemas = data["relationships_schemas"]
-        schema = self.schema
         # properties schema
         schema_nodes_properties = generate_schema_properties(nodes_schemas)
-        schema_relationships_properties = generate_schema_properties(
-            relationships_schemas
-        )
+        schema_relationships_properties = generate_schema_properties(relationships_schemas)
         for schema_nodes_property in schema_nodes_properties:
             exec(schema_nodes_property)

@@ -175,7 +176,6 @@ def run(self, data: dict):
         for schema_relationship in schema_relationships:
             exec(schema_relationship)

-        g = self.client.graph()
         # nodes
         nodes = generate_nodes(nodes)
         for node in nodes:
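
To make the exec'd strings concrete: a sketch of the statement generate_schema_relationships builds for one entry, reconstructed from the f-strings in this hunk (the real code executes it against a PyHugeClient schema object):

# Sample relationships schema entry, taken from the example file in this PR.
item = {"start": "Person", "end": "Webpage", "type": "owns", "properties": {}}

schema_statement = (
    f"schema.edgeLabel('{item['type']}')"
    f".sourceLabel('{item['start']}').targetLabel('{item['end']}').properties("
)
schema_statement += ", ".join(f"'{prop}'" for prop in item["properties"])
schema_statement += ").nullableKeys("
schema_statement += ", ".join(f"'{prop}'" for prop in item["properties"])
schema_statement += ").ifNotExist().create()"

print(schema_statement)
# schema.edgeLabel('owns').sourceLabel('Person').targetLabel('Webpage').properties().nullableKeys().ifNotExist().create()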
hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+
 import json
 import re
 from itertools import groupby
@@ -102,7 +104,7 @@ def generate_prompt(data) -> str:
"""


internalRegex = r"\[(.*?)\]"
INTERNAL_REGEX = r"\[(.*?)\]"


class DisambiguateData:
@@ -144,7 +146,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(dis_string)},
]
raw_nodes = self.llm.generate(messages)
n = re.findall(internalRegex, raw_nodes)
n = re.findall(INTERNAL_REGEX, raw_nodes)
new_nodes.extend(nodes_text_to_list_of_dict(n))

relationship_data = ""
@@ -172,7 +174,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(relationship_data)},
]
raw_relationships = self.llm.generate(messages)
rels = re.findall(internalRegex, raw_relationships)
rels = re.findall(INTERNAL_REGEX, raw_relationships)
new_relationships.extend(relationships_text_to_list_of_dict(rels))

if not self.is_user_schema:
@@ -193,7 +195,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(nodes_schemas_data)},
]
raw_nodes_schemas = self.llm.generate(messages)
n = re.findall(internalRegex, raw_nodes_schemas)
n = re.findall(INTERNAL_REGEX, raw_nodes_schemas)
new_nodes_schemas.extend(nodes_schemas_text_to_list_of_dict(n))

relationships_schemas_data = ""
@@ -210,12 +212,8 @@ def run(self, data: dict) -> dict[str, list[any]]:
+ "]\n"
)

node_schemas_labels = [
nodes_schemas["label"] for nodes_schemas in new_nodes_schemas
]
relationships_schemas_data += "Valid Labels:\n" + "\n".join(
node_schemas_labels
)
node_schemas_labels = [nodes_schemas["label"] for nodes_schemas in new_nodes_schemas]
relationships_schemas_data += "Valid Labels:\n" + "\n".join(node_schemas_labels)

messages = [
{
@@ -228,7 +226,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
             },
         ]
         raw_relationships_schemas = self.llm.generate(messages)
-        schemas_rels = re.findall(internalRegex, raw_relationships_schemas)
+        schemas_rels = re.findall(INTERNAL_REGEX, raw_relationships_schemas)
         new_relationships_schemas.extend(
             relationships_schemas_text_to_list_of_dict(schemas_rels)
         )
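
INTERNAL_REGEX is the non-greedy bracket matcher this file uses to pull structured records out of raw LLM output; a small sketch of that extraction, with an illustrative output string (the real prompts ask the model for bracketed records):

import re

INTERNAL_REGEX = r"\[(.*?)\]"  # pattern from this diff: non-greedy bracketed segments

raw_nodes = "[Sarah, Person, {age: 30}] [James, Person, {occupation: journalist}]"
print(re.findall(INTERNAL_REGEX, raw_nodes))
# ['Sarah, Person, {age: 30}', 'James, Person, {occupation: journalist}']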
hugegraph-llm/src/hugegraph_llm/operators/llm_op/parse_text_to_data.py
@@ -14,6 +14,8 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+
 import re
 from typing import List

@@ -89,8 +91,7 @@ def split_string_to_fit_token_space(
     current_chunk = ""
     for chunk in chunked_data:
         if (
-            llm.num_tokens_from_string(current_chunk)
-            + llm.num_tokens_from_string(chunk)
+            llm.num_tokens_from_string(current_chunk) + llm.num_tokens_from_string(chunk)
             < allowed_tokens
         ):
             current_chunk += chunk
@@ -123,10 +124,8 @@ def get_nodes_and_relationships_from_result(result):
     nodes.extend(re.findall(internal_regex, raw_nodes))
     relationships.extend(re.findall(internal_regex, raw_relationships))
     nodes_schemas.extend(re.findall(internal_regex, raw_nodes_schemas))
-    relationships_schemas.extend(
-        re.findall(internal_regex, raw_relationships_schemas)
-    )
-    result = dict()
+    relationships_schemas.extend(re.findall(internal_regex, raw_relationships_schemas))
+    result = {}
     result["nodes"] = []
     result["relationships"] = []
     result["nodes_schemas"] = []
@@ -159,9 +158,7 @@ def process(self, chunk):
     def run(self, data: dict) -> dict[str, list[any]]:
         system_message = generate_system_message()
         prompt_string = generate_prompt("")
-        token_usage_per_prompt = self.llm.num_tokens_from_string(
-            system_message + prompt_string
-        )
+        token_usage_per_prompt = self.llm.num_tokens_from_string(system_message + prompt_string)
         chunked_data = split_string_to_fit_token_space(
             llm=self.llm, string=self.text, token_use_per_string=token_usage_per_prompt
         )
@@ -178,9 +175,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
 class ParseTextToDataWithSchemas:
     llm: BaseLLM

-    def __init__(
-        self, llm: BaseLLM, text: str, nodes_schema, relationships_schemas
-    ) -> None:
+    def __init__(self, llm: BaseLLM, text: str, nodes_schema, relationships_schemas) -> None:
         self.llm = llm
         self.text = text
         self.data = {}
@@ -204,9 +199,7 @@ def process_with_schemas(self, chunk):
     def run(self) -> dict[str, list[any]]:
         system_message = generate_system_message_with_schemas()
         prompt_string = generate_prompt_with_schemas("", "", "")
-        token_usage_per_prompt = self.llm.num_tokens_from_string(
-            system_message + prompt_string
-        )
+        token_usage_per_prompt = self.llm.num_tokens_from_string(system_message + prompt_string)
         chunked_data = split_string_to_fit_token_space(
             llm=self.llm, string=self.text, token_use_per_string=token_usage_per_prompt
         )
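
Both run() methods budget tokens the same way: measure the prompt scaffolding once, then pack text chunks greedily while the running total stays under the model's allowance. A minimal sketch of that accumulation loop, using a word-count stand-in for the LLM tokenizer (hypothetical helper names, not the actual functions):

def num_tokens(s: str) -> int:
    # Stand-in tokenizer; the real code asks the LLM wrapper, which uses tiktoken.
    return len(s.split())


def split_to_fit(chunks: list[str], max_tokens: int, tokens_per_prompt: int) -> list[str]:
    allowed = max_tokens - tokens_per_prompt  # budget left after the prompt itself
    packed, current = [], ""
    for chunk in chunks:
        if num_tokens(current) + num_tokens(chunk) < allowed:
            current += chunk  # still fits: extend the running chunk
        else:
            packed.append(current)  # over budget: emit and start a new chunk
            current = chunk
    if current:
        packed.append(current)
    return packed


print(split_to_fit(["a b c ", "d e ", "f g h i "], max_tokens=10, tokens_per_prompt=4))
# ['a b c d e ', 'f g h i ']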